mirror of
https://github.com/acamarata/pray-calc-ml.git
synced 2026-07-01 19:34:26 +00:00
Add 6 new data collection pipelines and their processed outputs: Sources added: - TESS/Stars4All photometer network: 37 months (Jun 2017-Aug 2020), ~40k raw events from 100+ European stations via Zenodo archives - Globe at Night citizen science: 26k twilight observations (2006-2024), filtered from 308k total observations for solar depression 6-22 deg - GaN-MN continuous monitoring: 45 months (Jan 2022-Sep 2025), ~12.5k twilight events from 88 stations across 20+ countries - Galicia SQM network: 14 stations, 1-min resolution, 7.5k events - Madrid/Majadahonda SQM: multi-year continuous monitoring, 3.1k events - washetdonker.nl Netherlands: 7 stations, 3.3k morning events - Academic papers: Jordan (Abed 2015), Fayum Egypt, India photometer Pipeline changes: - ingest.py: add all new files to APPROVED_RAW_CSVS allowlist, fix filter to use allowlist instead of hardcoded exclusions - .gitignore: exclude bulk raw data directories (BSRN, TESS, GaN-MN, washetdonker, Globe at Night downloads) Final dataset: 56,668 Fajr + 34,763 Isha = 91,431 total records Previous: 5,871 Fajr + 46 Isha = 5,917 total records
202 lines
6.1 KiB
Python
202 lines
6.1 KiB
Python
"""
|
|
Process Majadahonda SQM 2019 tesstractor data.
|
|
|
|
Station: Majadahonda, Madrid, Spain
|
|
Coordinates: 40.469°N, 3.863°W
|
|
Elevation: ~700m (Majadahonda suburb, 700m asl approx)
|
|
Timezone: Europe/Madrid (UTC+1/+2)
|
|
Format: ;-separated
|
|
col0: UTC timestamp (ISO 8601)
|
|
col1: local timestamp
|
|
col2: integration time (99.0 = 5 min)
|
|
col3: temperature °C
|
|
col4: frequency Hz
|
|
col5: MSAS magnitude (mag/arcsec²) — 0.00 means daylight/overexposed
|
|
col6: calibration MSAS
|
|
|
|
Source: https://zenodo.org/records/5709962
|
|
DOI: 10.5281/zenodo.5709962
|
|
|
|
Processing:
|
|
- Parse all .dat files (5-minute cadence, tesstractor format)
|
|
- For each night, compute solar depression angle per reading using PyEphem
|
|
- Find morning twilight window: depression 10°-20°, MSAS dropping (sky brightening)
|
|
- Find evening twilight window: depression 10°-20°, MSAS rising (sky darkening)
|
|
- Extract inflection point (maximum |d(MSAS)/dt|) as the "twilight event"
|
|
- Output one row per event
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import datetime
|
|
import ephem
|
|
import pandas as pd
|
|
import numpy as np
|
|
|
|
# Station location: Majadahonda, Madrid (40.469°N, 3.863°W, ~700 m).
LAT: float = 40.469  # degrees, positive north
LON: float = -3.863  # degrees, negative = west
ELEV: float = 700.0  # metres (approximate)

# Civil-time offsets from UTC, in hours.
UTC_OFFSET_WINTER: int = 1  # CET
UTC_OFFSET_SUMMER: int = 2  # CEST

# Input directory of tesstractor .dat files and the CSV this script writes.
DAT_DIR: str = "/Volumes/X9/Sites/acamarata/pray-calc-ml/data/raw/majadahonda_sqm_2019/tesstractor"
OUT_CSV: str = "/Volumes/X9/Sites/acamarata/pray-calc-ml/data/raw/raw_sightings/majadahonda_2019_sqm.csv"
# Provenance tag written to the "source" column of every output row.
SOURCE: str = "Majadahonda_SQM_2019_Zenodo5709962"

# Solar depression bounds (degrees below the horizon) of the twilight window.
DEP_MIN: float = 10.0
DEP_MAX: float = 20.0
|
|
|
|
|
|
def compute_solar_depression(utc_dt: datetime.datetime) -> float:
    """Return the Sun's depression angle at the station for a UTC instant.

    Args:
        utc_dt: Naive datetime interpreted as UTC.

    Returns:
        Degrees the Sun is below the horizon (positive = below), i.e. the
        negated altitude computed by PyEphem for LAT/LON/ELEV.
    """
    site = ephem.Observer()
    # PyEphem reads string angles as degrees (bare floats would be radians).
    site.lat, site.lon = str(LAT), str(LON)
    site.elev = ELEV
    # Zero pressure turns off PyEphem's atmospheric refraction correction.
    site.pressure = 0
    site.epoch = ephem.J2000
    site.date = utc_dt.strftime("%Y/%m/%d %H:%M:%S")

    body = ephem.Sun()
    body.compute(site)

    # body.alt is in radians; convert to degrees and flip the sign.
    return -(float(body.alt) * 180.0 / ephem.pi)
|
|
|
|
|
|
def parse_dat_file(fpath: str) -> pd.DataFrame:
    """Read one ';'-separated tesstractor .dat file into a DataFrame.

    Blank lines, lines starting with '#', lines with fewer than seven
    fields, and lines whose timestamp or magnitude cannot be parsed are
    skipped without raising.

    Args:
        fpath: Path of the .dat file to read.

    Returns:
        DataFrame with columns ``utc_dt`` (naive datetime from field 0) and
        ``msas`` (float from field 6), sorted by ``utc_dt`` with a fresh
        index. A column-less empty DataFrame when no line could be parsed.
    """
    records = []
    with open(fpath, "r", encoding="utf-8", errors="replace") as handle:
        for raw in handle:
            text = raw.strip()
            if text == "" or text.startswith("#"):
                continue

            fields = text.split(";")
            if len(fields) < 7:
                continue

            try:
                # NOTE(review): the module docstring lists col5 as MSAS and
                # col6 as calibration MSAS; this reads col6 -- confirm that
                # is the intended column.
                magnitude = float(fields[6])
                # Drop a trailing fractional-seconds part before parsing.
                stamp = datetime.datetime.strptime(
                    re.sub(r"\.\d+$", "", fields[0]), "%Y-%m-%dT%H:%M:%S"
                )
            except (ValueError, IndexError):
                continue

            records.append({"utc_dt": stamp, "msas": magnitude})

    if len(records) == 0:
        return pd.DataFrame()

    return pd.DataFrame(records).sort_values("utc_dt").reset_index(drop=True)
|
|
|
|
|
|
def _last_sunday(year: int, month: int) -> datetime.date:
    """Return the last Sunday of a 31-day month (used for March and October)."""
    month_end = datetime.date(year, month, 31)
    # date.weekday() is Monday=0 .. Sunday=6, so this steps back 0-6 days.
    return month_end - datetime.timedelta(days=(month_end.weekday() + 1) % 7)


def _spain_utc_offset(utc_dt: datetime.datetime) -> int:
    """Return the civil UTC offset (hours) for the station at a naive UTC time.

    Applies the EU summer-time rule: summer time runs from 01:00 UTC on the
    last Sunday of March until 01:00 UTC on the last Sunday of October.
    """
    switch_time = datetime.time(1, 0)
    dst_start = datetime.datetime.combine(_last_sunday(utc_dt.year, 3), switch_time)
    dst_end = datetime.datetime.combine(_last_sunday(utc_dt.year, 10), switch_time)
    if utc_dt >= dst_start and utc_dt < dst_end:
        return UTC_OFFSET_SUMMER
    return UTC_OFFSET_WINTER


def _extract_event(segment: pd.DataFrame, prayer: str) -> "dict | None":
    """Build the output row for the reading with the largest |d(MSAS)| step.

    Returns None when the segment has fewer than three readings.
    """
    if len(segment) < 3:
        return None

    # Row whose MSAS changed most (in either direction) from the previous
    # twilight-window reading.
    peak_row = segment.loc[segment["dmsas"].abs().idxmax()]
    utc_dt = peak_row["utc_dt"]
    depression = peak_row["depression"]

    utc_off = _spain_utc_offset(utc_dt)
    local_dt = utc_dt + datetime.timedelta(hours=utc_off)

    return {
        "prayer": prayer,
        "date_local": local_dt.strftime("%Y-%m-%d"),
        "time_local": local_dt.strftime("%H:%M:%S"),
        "utc_offset": utc_off,
        "lat": LAT,
        "lng": LON,
        "elevation_m": ELEV,
        "source": SOURCE,
        "notes": f"sqm_msas={peak_row['msas']:.2f},solar_dep={depression:.2f}deg,inflection_method",
    }


def find_twilight_events(df: pd.DataFrame) -> list:
    """Find the morning and evening twilight inflection points in a file's data.

    Readings are kept when the solar depression lies in [DEP_MIN, DEP_MAX]
    and MSAS exceeds 5.0 (which drops the 0.00 daylight/overexposed
    readings). Kept readings are split by whether the depression fell
    (morning, "fajr") or rose (evening, "isha") relative to the previous
    kept reading, and for each side the reading with the largest absolute
    MSAS step is reported.

    Note: only the depression trend selects morning vs. evening; the sign
    of the MSAS change is not checked.

    Args:
        df: Frame with ``utc_dt`` (naive UTC datetimes) and ``msas`` columns,
            as produced by ``parse_dat_file``.

    Returns:
        A list of zero, one or two event dicts (fajr first, then isha) with
        keys prayer, date_local, time_local, utc_offset, lat, lng,
        elevation_m, source and notes. Empty when ``df`` is empty or fewer
        than four readings fall in the twilight window.
    """
    if df.empty:
        return []

    # Work on a copy so the caller's frame is not given an extra column.
    df = df.copy()
    df["depression"] = [compute_solar_depression(ts) for ts in df["utc_dt"]]

    in_window = (
        (df["depression"] >= DEP_MIN)
        & (df["depression"] <= DEP_MAX)
        & (df["msas"] > 5.0)
    )
    twi = df[in_window].sort_values("utc_dt").reset_index(drop=True)
    if len(twi) < 4:
        return []

    # Step-to-step change between consecutive kept readings. The first row
    # has NaN for both and therefore lands in neither segment.
    twi["dmsas"] = twi["msas"].diff()
    twi["ddep"] = twi["depression"].diff()

    segments = (
        ("fajr", twi[twi["ddep"] < 0]),  # depression falling: sun rising
        ("isha", twi[twi["ddep"] > 0]),  # depression rising: sun setting
    )

    events = []
    for prayer, segment in segments:
        event = _extract_event(segment, prayer)
        if event is not None:
            events.append(event)
    return events
|
|
|
|
|
|
def main():
    """Process every .dat file in DAT_DIR and write the events to OUT_CSV.

    Files are handled in sorted name order. Progress and a per-prayer count
    are printed; when no events are found a message is printed and no CSV
    is written.
    """
    dat_files = sorted(
        name for name in os.listdir(DAT_DIR) if name.endswith(".dat")
    )
    print(f"Processing {len(dat_files)} .dat files...")

    all_events = []
    for fname in dat_files:
        frame = parse_dat_file(os.path.join(DAT_DIR, fname))
        if frame.empty:
            continue
        found = find_twilight_events(frame)
        if found:
            all_events += found
            print(f" {fname}: {len(found)} events")

    if len(all_events) == 0:
        print("No events found.")
        return

    column_order = [
        "prayer", "date_local", "time_local", "utc_offset",
        "lat", "lng", "elevation_m", "source", "notes",
    ]
    out_df = (
        pd.DataFrame(all_events, columns=column_order)
        .sort_values(["date_local", "prayer"])
        .reset_index(drop=True)
    )
    out_df.to_csv(OUT_CSV, index=False)

    print(f"\nWrote {len(out_df)} rows to {OUT_CSV}")
    print(out_df["prayer"].value_counts().to_string())
|
|
|
|
|
|
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|