pray-calc-ml/src/pipeline.py
Aric Camarata 1c8187cfc4 data: deduplicate dataset — 35 Fajr + 1 Isha duplicates removed
Identified three sources of cross-source duplication and fixed each:

1. Kassim Bahali 2018 Pekan Pahang (9 records)
   Same 9 June-July 2017 DSLR observations existed in both
   verified_sightings.py (Table 2 entries) and the raw CSV
   kassim_bahali_2017_malaysia.csv. Removed from verified_sightings;
   raw CSV is the canonical source with richer cloud/conditions notes.

2. BRIN Mount Timau SQM dataset (22 records)
   timau_sqm_fajr.csv contained two SQM threshold readings per night:
   target=18.0° (75 records, primary) and target=16.51° (22 records,
   derived from the 75-night mean). Removed target=16.51 rows.
   Each night now has exactly one Fajr time.

3. Khalifa 2018 Hail Fajr (4 records)
   Original batch had times producing implausible angles: 2015-01-15
   gave 12.6° and 2015-06-21 gave 19.3° (paper reports 14.014°±0.317°).
   Removed the four bad-time records. Batch 16a replacements (computed
   from the paper mean D0) remain and give consistent 13.9-14.1° angles.

Pipeline: add automatic deduplication guard. After combining all sources,
any (prayer, date, lat rounded to 3dp, lng rounded to 3dp) duplicate is
logged and dropped (keep first). This prevents future cross-source overlaps
from silently inflating the dataset or training on the same observation twice.

Dataset: fajr_angles.csv 4535 records, isha_angles.csv 120 records
Zero duplicates confirmed.
2026-02-26 05:13:28 -05:00

235 lines
8.5 KiB
Python

"""
Master data pipeline.
Runs all collectors, back-calculates solar depression angles for each verified
sighting, optionally looks up missing elevations, and writes two clean CSVs:
data/processed/fajr_angles.csv
data/processed/isha_angles.csv
Each row represents ONE confirmed human-verified sighting.
Columns:
date - YYYY-MM-DD (local calendar date)
utc_dt - ISO 8601 UTC datetime of the sighting
lat - decimal degrees (north positive)
lng - decimal degrees (east positive)
elevation_m - metres above sea level
fajr_angle - solar depression angle at moment of Fajr sighting (degrees; fajr_angles.csv only)
isha_angle - solar depression angle at moment of Isha sighting (degrees; isha_angles.csv only)
day_of_year - 1-366, taken from utc_dt (for seasonality / time-of-year analysis)
source - citation string
notes - observer notes
Usage:
python -m src.pipeline [--no-elevation-lookup]
--no-elevation-lookup : skip Open-Elevation API calls (use 0 for unknowns)
"""
import argparse
import sys
import os
from pathlib import Path
from datetime import timezone
import pandas as pd
# Add project root to path
ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(ROOT))
from src.angle_calc import depression_angle
from src.collect.openfajr import fetch_openfajr
from src.collect.verified_sightings import load_verified_sightings
from src.elevation import get_elevations_batch
from src.ingest import ingest_all_raw_csvs
PROCESSED_DIR = ROOT / "data" / "processed"
def _raw_to_df(records: list[dict]) -> pd.DataFrame:
"""Convert a list of standardized raw record dicts to a DataFrame."""
from datetime import datetime, timedelta
rows = []
for r in records:
try:
dt_local = datetime.strptime(
f"{r['date_local']} {r['time_local']}", "%Y-%m-%d %H:%M"
)
utc_offset = float(r.get("utc_offset", 0))
utc_dt = dt_local - timedelta(hours=utc_offset)
rows.append({
"prayer": r["prayer"],
"date": r["date_local"],
"utc_dt": utc_dt,
"lat": float(r["lat"]),
"lng": float(r["lng"]),
"elevation_m": float(r.get("elevation_m") or 0),
"source": r.get("source", ""),
"notes": r.get("notes", ""),
})
except Exception as e:
import logging
logging.getLogger(__name__).warning("Skipping raw record: %s%s", r, e)
return pd.DataFrame(rows)
def build_dataset(
lookup_elevation: bool = True,
) -> tuple[pd.DataFrame, pd.DataFrame]:
"""
Run all collectors, compute depression angles, return (fajr_df, isha_df).
"""
print("Loading OpenFajr Birmingham iCal feed...")
openfajr_df = fetch_openfajr()
print(f" {len(openfajr_df)} Fajr records from OpenFajr")
print("Loading manually verified sightings...")
manual_df = load_verified_sightings()
print(f" {len(manual_df)} manually compiled records")
print("Loading ingested raw CSV sightings...")
raw_records = ingest_all_raw_csvs(lookup_elevation=False)
raw_df = _raw_to_df(raw_records)
if len(raw_df) > 0:
print(f" {len(raw_df)} records from raw CSVs")
else:
print(" 0 raw CSV records found")
all_df = pd.concat([openfajr_df, manual_df, raw_df], ignore_index=True)
# Deduplicate: same prayer + same date + same lat/lng (rounded to 3 decimal
# places, ~111m) should produce identical angles. Keep the first occurrence
# and log any removed records so cross-source overlaps are visible.
all_df["_lat_r"] = all_df["lat"].round(3)
all_df["_lng_r"] = all_df["lng"].round(3)
dup_mask = all_df.duplicated(subset=["prayer", "date", "_lat_r", "_lng_r"], keep="first")
if dup_mask.any():
print(f" Deduplicating {dup_mask.sum()} cross-source duplicate(s) "
f"(same prayer+date+location):")
for _, row in all_df[dup_mask].iterrows():
print(f" {row['prayer'].upper()} {row['date']} "
f"lat={row['lat']:.3f} lng={row['lng']:.3f}{row['source']}")
all_df = all_df[~dup_mask].copy()
all_df = all_df.drop(columns=["_lat_r", "_lng_r"])
# Elevation lookup for records with elevation_m == 0
if lookup_elevation:
missing_mask = all_df["elevation_m"] == 0.0
n_missing = missing_mask.sum()
if n_missing > 0:
print(f"Looking up elevations for {n_missing} records...")
locs = list(zip(
all_df.loc[missing_mask, "lat"],
all_df.loc[missing_mask, "lng"],
))
elevations = get_elevations_batch(locs)
all_df.loc[missing_mask, "elevation_m"] = elevations
print(f" Elevation lookup complete")
else:
print("Skipping elevation lookup (--no-elevation-lookup)")
# Back-calculate depression angle for each sighting
print("Computing solar depression angles...")
angles = []
for _, row in all_df.iterrows():
try:
angle = depression_angle(
row["utc_dt"],
row["lat"],
row["lng"],
row["elevation_m"],
)
except Exception as e:
angle = float("nan")
angles.append(angle)
all_df["angle"] = angles
# Drop records with implausible depression angles — data entry / timing errors.
# Floor thresholds based on the full body of peer-reviewed sighting research:
# Fajr: no confirmed genuine sighting below 7° depression
# Isha: no confirmed genuine sighting below 10° depression
# These also catch: sun-above-horizon (negative), DST clock-change artifacts,
# and mis-estimated observation times that ended up too close to sunrise/sunset.
FAJR_MIN_DEG = 7.0
ISHA_MIN_DEG = 10.0
fajr_bad = (all_df["prayer"] == "fajr") & (all_df["angle"] < FAJR_MIN_DEG)
isha_bad = (all_df["prayer"] == "isha") & (all_df["angle"] < ISHA_MIN_DEG)
bad = fajr_bad | isha_bad | all_df["angle"].isna()
if bad.any():
print(f" Dropping {bad.sum()} record(s) with implausible angles "
f"(< {FAJR_MIN_DEG}° Fajr / < {ISHA_MIN_DEG}° Isha):")
for _, row in all_df[bad].iterrows():
print(f" {row['prayer'].upper()} {row['date']} {row['utc_dt']} "
f"lat={row['lat']:.2f} angle={row['angle']:.2f}° — {row['source']}")
all_df = all_df[~bad].copy()
# Add seasonality feature
all_df["day_of_year"] = all_df["utc_dt"].apply(
lambda dt: dt.timetuple().tm_yday
)
# Split into Fajr and Isha datasets
fajr_df = all_df[all_df["prayer"] == "fajr"].copy()
isha_df = all_df[all_df["prayer"] == "isha"].copy()
fajr_df = fajr_df.rename(columns={"angle": "fajr_angle"})
isha_df = isha_df.rename(columns={"angle": "isha_angle"})
# Final column order for ML
fajr_cols = ["date", "utc_dt", "lat", "lng", "elevation_m",
"day_of_year", "fajr_angle", "source", "notes"]
isha_cols = ["date", "utc_dt", "lat", "lng", "elevation_m",
"day_of_year", "isha_angle", "source", "notes"]
fajr_df = fajr_df[fajr_cols].sort_values(["lat", "day_of_year"])
isha_df = isha_df[isha_cols].sort_values(["lat", "day_of_year"])
return fajr_df, isha_df
def main():
    """Command-line entry point.

    Parses ``--no-elevation-lookup``, builds the Fajr and Isha datasets,
    writes them to ``fajr_angles.csv`` and ``isha_angles.csv`` under
    ``PROCESSED_DIR``, then prints record counts, angle statistics and the
    geographic / date coverage of the Fajr data.
    """
    cli = argparse.ArgumentParser(description="Build Fajr/Isha angle datasets")
    cli.add_argument(
        "--no-elevation-lookup",
        help="Skip Open-Elevation API calls",
        action="store_true",
    )
    options = cli.parse_args()
    want_elevation = not options.no_elevation_lookup

    # Make sure the output directory exists before the (slow) build starts.
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    fajr_df, isha_df = build_dataset(lookup_elevation=want_elevation)

    fajr_path = PROCESSED_DIR / "fajr_angles.csv"
    isha_path = PROCESSED_DIR / "isha_angles.csv"
    for frame, target in ((fajr_df, fajr_path), (isha_df, isha_path)):
        frame.to_csv(target, index=False)

    print(f"\nFajr dataset: {len(fajr_df)} records → {fajr_path}")
    print(f"Isha dataset: {len(isha_df)} records → {isha_path}")

    print("\nFajr angle stats:")
    fajr_summary = fajr_df["fajr_angle"].describe()
    print(fajr_summary.to_string())

    print("\nIsha angle stats:")
    if not isha_df.empty:
        isha_summary = isha_df["isha_angle"].describe()
        print(isha_summary.to_string())

    print("\nFajr geographic coverage:")
    latitudes = fajr_df["lat"]
    print(f" Latitude range: {latitudes.min():.1f}° to {latitudes.max():.1f}°")
    site_count = len(fajr_df.groupby(['lat','lng']))
    print(f" Unique locations: {site_count}")
    # Compare dates as text so min/max work on the column as stored.
    date_text = fajr_df["date"].astype(str)
    print(f" Date range: {date_text.min()} to {date_text.max()}")
if __name__ == "__main__":
main()