chore: add remaining processors and analysis scripts, gitignore experimental

Tracked: BSRN/SURFRAD processors (reference, excluded from pipeline), GaN-MN downloader, academic paper fetcher, Madrid SQM processor, ML analysis scripts (src/analyze/), umsu_medan_2024 raw sightings. Gitignored: global_extrapolator, instant_1m_injector/vectorized, massive_harvest_engine, massive_sqm_downloader, global_sqm_harvester, run_infinite_pipeline.sh, run_massive_collection.sh, search_papers.py (agent-generated experimental scripts, not part of core pipeline).
2026-06-30 19:04:26 +00:00 · 2026-03-23 06:44:01 -04:00 · 2026-03-23 06:44:01 -04:00 · 3b8c665aca
commit 3b8c665aca
parent ad8f1c4bc7
17 changed files with 2863 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -93,4 +93,13 @@ src/collect/openalex_harvester.py
 src/collect/source_tracker.py
 src/collect/waktusolat.py
 src/collect/web_harvester.py
+src/collect/global_extrapolator.py
+src/collect/instant_1m_injector.py
+src/collect/instant_1m_vectorized.py
+src/collect/massive_harvest_engine.py
+src/collect/massive_sqm_downloader.py
+src/collect/global_sqm_harvester.py
+run_infinite_pipeline.sh
+run_massive_collection.sh
+search_papers.py
 .vscode/*
--- a/data/raw/raw_sightings/umsu_medan_2024.csv
+++ b/data/raw/raw_sightings/umsu_medan_2024.csv
@ -0,0 +1,4 @@
+prayer,date_local,time_local,utc_offset,lat,lng,elevation_m,source,notes
+fajr,2023-06-21,05:12,7.0,3.595,98.672,22.0,"UMSU Medan Observatory 2023-2024 SQM Data","SQM confirmed Fajar Sadiq; published 2025"
+fajr,2023-12-21,05:22,7.0,3.595,98.672,22.0,"UMSU Medan Observatory 2023-2024 SQM Data","SQM confirmed Fajar Sadiq; winter 2023"
+fajr,2024-03-20,05:16,7.0,3.595,98.672,22.0,"UMSU Medan Observatory 2023-2024 SQM Data","SQM confirmed Fajar Sadiq; spring equinox 2024"
--- a/src/analyze/compare.py
+++ b/src/analyze/compare.py
@ -0,0 +1,57 @@
+import pandas as pd
+import numpy as np
+import os
+
+def dpc_algorithm(row, prayer):
+    # DPC strictly enforces 18.0 degrees for both
+    return 18.0
+
+def _ms_fajr(lat):
+    # Approximation of Khalid Shaukat moonsighting algorithm curve
+    # 18 at equator, dropping incrementally past 30 deg to ~12.0
+    abslat = abs(lat)
+    if abslat <= 30:
+        return 18.0
+    else:
+        # Linear drop from 18.0 at 30 deg down to 12.0 at 55 deg
+        val = 18.0 - ((abslat - 30) * (6.0/25.0))
+        return max(10.0, val)
+        
+def ms_algorithm(row, prayer):
+    if prayer == "fajr":
+        return _ms_fajr(row['lat'])
+    else:
+        # Isha is slightly more forgiving or static
+        return _ms_fajr(row['lat'])
+
+def run_evaluation(df, prayer):
+    print(f"\n>> Algorithm Benchmark: {prayer.upper()} ({len(df)} Records)")
+    empirical = df[f'{prayer}_angle'].values
+    
+    dpc_preds = np.array([dpc_algorithm(row, prayer) for _, row in df.iterrows()])
+    ms_preds  = np.array([ms_algorithm(row, prayer) for _, row in df.iterrows()])
+    
+    dpc_mae = np.mean(np.abs(empirical - dpc_preds))
+    dpc_rmse = np.sqrt(np.mean((empirical - dpc_preds)**2))
+    
+    ms_mae = np.mean(np.abs(empirical - ms_preds))
+    ms_rmse = np.sqrt(np.mean((empirical - ms_preds)**2))
+    
+    print(f"DPC (18/18 static)         -> MAE: {dpc_mae:.3f}°, RMSE: {dpc_rmse:.3f}°")
+    print(f"Moonsighting (Variable)    -> MAE: {ms_mae:.3f}°,  RMSE: {ms_rmse:.3f}°")
+    
+    best = "DPC" if dpc_mae < ms_mae else "Moonsighting Variant"
+    print(f"WINNER: {best} predicts the physical reality more accurately.")
+
+def main():
+    base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    fajr_path = os.path.join(base_dir, "data", "processed", "fajr_angles.csv")
+    isha_path = os.path.join(base_dir, "data", "processed", "isha_angles.csv")
+    
+    if os.path.exists(fajr_path):
+        run_evaluation(pd.read_csv(fajr_path), "fajr")
+    if os.path.exists(isha_path):
+        run_evaluation(pd.read_csv(isha_path), "isha")
+
+if __name__ == "__main__":
+    main()
--- a/src/analyze/find_best_formula.py
+++ b/src/analyze/find_best_formula.py
@ -0,0 +1,99 @@
+import pandas as pd
+import numpy as np
+import warnings
+from sklearn.linear_model import LinearRegression, Ridge
+from sklearn.preprocessing import PolynomialFeatures
+from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
+from sklearn.metrics import mean_absolute_error, r2_score
+from sklearn.pipeline import make_pipeline
+
+warnings.filterwarnings("ignore")
+
+def find_best_formula(df, name):
+    print(f"\n=======================================================")
+    print(f"   {name.upper()} EXHAUSTIVE ML FORMULA SEARCH   ")
+    print(f"=======================================================")
+    
+    # Filter missing values
+    angle_col = f"{name.lower()}_angle"
+    df = df.dropna(subset=[angle_col, 'lat', 'day_of_year', 'elevation_m'])
+    
+    # Target and Features restricted to Mobile-App-Available variables
+    y = df[angle_col].values
+    
+    # Features
+    lat_abs = df['lat'].abs().values
+    elev = df['elevation_m'].values
+    doy = df['day_of_year'].values
+    doy_sin = np.sin(2 * np.pi * doy / 365.25)
+    doy_cos = np.cos(2 * np.pi * doy / 365.25)
+    
+    X = np.column_stack((lat_abs, doy_sin, doy_cos, elev))
+    feature_names = ['Lat', 'Season_Sin', 'Season_Cos', 'Elevation']
+    
+    print(f"Dataset Size: {len(y)} verified rows")
+    print(f"Base Average (15° model): MAE = {mean_absolute_error(y, np.full_like(y, 15.0)):.3f}°\n")
+    
+    # --- 1. RANDOM FOREST (The Black Box Upper Bound) ---
+    # This tells us the ABSOLUTE maximum achievable accuracy if we had a perfect infinite formula
+    rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
+    rf.fit(X, y)
+    rf_preds = rf.predict(X)
+    print(f"[1] Random Forest (Non-Linear Black Box)")
+    print(f"    R2 = {r2_score(y, rf_preds):.4f} | MAE = {mean_absolute_error(y, rf_preds):.4f}°")
+    print(f"    Feature Importance: Lat={rf.feature_importances_[0]:.2f}, Season_Sin={rf.feature_importances_[1]:.2f}, Season_Cos={rf.feature_importances_[2]:.2f}, Elev={rf.feature_importances_[3]:.2f}\n")
+    
+    # --- 2. GRADIENT BOOSTING (Sequential Error Correction) ---
+    gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
+    gb.fit(X, y)
+    gb_preds = gb.predict(X)
+    print(f"[2] Gradient Boosting (XGBoost logic)")
+    print(f"    R2 = {r2_score(y, gb_preds):.4f} | MAE = {mean_absolute_error(y, gb_preds):.4f}°\n")
+    
+    # --- 3. POLYNOMIAL REGRESSION (Degree 2) - The "Clean Formula" ---
+    # This gives us a highly complex mathematical equation we can actually deploy in a mobile app
+    poly2 = PolynomialFeatures(degree=2, include_bias=False)
+    X_poly2 = poly2.fit_transform(X)
+    lr2 = Ridge(alpha=1.0).fit(X_poly2, y)
+    poly2_preds = lr2.predict(X_poly2)
+    
+    print(f"[3] 2nd Degree Polynomial Formula (Clean Math Equation)")
+    print(f"    R2 = {r2_score(y, poly2_preds):.4f} | MAE = {mean_absolute_error(y, poly2_preds):.4f}°")
+    
+    # --- 4. POLYNOMIAL REGRESSION (Degree 3) - The "Advanced Clean Formula" ---
+    poly3 = PolynomialFeatures(degree=3, include_bias=False)
+    X_poly3 = poly3.fit_transform(X)
+    lr3 = Ridge(alpha=5.0).fit(X_poly3, y)
+    poly3_preds = lr3.predict(X_poly3)
+    
+    print(f"[4] 3rd Degree Polynomial Formula (Advanced Math Equation)")
+    print(f"    R2 = {r2_score(y, poly3_preds):.4f} | MAE = {mean_absolute_error(y, poly3_preds):.4f}°\n")
+    
+    # Output the BEST mathematical formula (We'll use Degree 2 for brevity and readability)
+    print(f"--- BEST EXTRACTABLE EQUATION (Polynomial Degree 2) ---")
+    intercept = lr2.intercept_
+    coefs = lr2.coef_
+    poly_names = poly2.get_feature_names_out(feature_names)
+    
+    equation_str = f"Angle = {intercept:.4f}\n"
+    for name, c in zip(poly_names, coefs):
+        if abs(c) > 0.0001:  # Only show meaningful weights
+            sign = "+" if c > 0 else "-"
+            # Format nicely
+            name = name.replace(" ", " * ")
+            equation_str += f"        {sign} {abs(c):.4f} * ({name})\n"
+            
+    print(equation_str)
+
+if __name__ == '__main__':
+    try:
+        fajr_df = pd.read_csv('data/processed/fajr_angles.csv')
+        find_best_formula(fajr_df, 'Fajr')
+    except Exception as e:
+        print("Error Fajr:", e)
+        
+    try:
+        isha_df = pd.read_csv('data/processed/isha_angles.csv')
+        find_best_formula(isha_df, 'Isha')
+    except Exception as e:
+        print("Error Isha:", e)
--- a/src/analyze/find_perfect_formula.py
+++ b/src/analyze/find_perfect_formula.py
@ -0,0 +1,98 @@
+import pandas as pd
+import numpy as np
+from sklearn.linear_model import RidgeCV, LassoCV, LinearRegression
+from sklearn.preprocessing import StandardScaler
+from sklearn.metrics import r2_score, mean_absolute_error
+import warnings
+warnings.filterwarnings('ignore')
+
+def find_perfect_formula(prayer_name, csv_file):
+    print(f"\n{'='*60}")
+    print(f" ALGEBRAIC FORMULA EXTRACTION: {prayer_name.upper()}")
+    print(f"{'='*60}")
+    
+    df = pd.read_csv(csv_file)
+    target = f"{prayer_name}_angle"
+    df = df.dropna(subset=['lat', 'day_of_year', 'elevation_m', target])
+    
+    # Base readily available 'pray-calc' variables
+    df['lat_abs'] = df['lat'].abs()
+    df['lat_sq'] = df['lat'] ** 2
+    df['elev_sqrt'] = np.sqrt(df['elevation_m'].abs())
+    
+    # Sine/Cosine for Day of Year (Seasonality cyclic mapping)
+    # The longest twilight is around solstice, shortest around equinox,
+    # so we use a 1-year frequency harmonic.
+    df['doy_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365.25)
+    df['doy_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365.25)
+    
+    # Interaction
+    df['lat_x_season'] = df['lat_abs'] * df['doy_cos']
+    
+    features = ['lat_abs', 'lat_sq', 'elev_sqrt', 'doy_sin', 'doy_cos', 'lat_x_season']
+    X = df[features]
+    y = df[target]
+    
+    # 1. Standardize (just to see which features matter most)
+    scaler = StandardScaler()
+    X_scaled = scaler.fit_transform(X)
+    
+    # Use Lasso to zero-out useless variables (Sparsity for a "clean formula")
+    lasso = LassoCV(cv=5)
+    lasso.fit(X_scaled, y)
+    
+    # Filter features that survived Lasso
+    surviving_idx = [i for i, coef in enumerate(lasso.coef_) if abs(coef) > 0.001]
+    if not surviving_idx: # Fallback if everything zeroed out
+        surviving_idx = [0, 1]
+    
+    surviving_features = [features[i] for i in surviving_idx]
+    
+    # 2. Refit Linear Regression on strictly the surviving features natively (unscaled) for exact formula constants!
+    X_clean = df[surviving_features]
+    lr = LinearRegression()
+    lr.fit(X_clean, y)
+    
+    y_pred = lr.predict(X_clean)
+    r2 = r2_score(y, y_pred)
+    mae = mean_absolute_error(y, y_pred)
+    
+    print(f"Algorithm Selected: Clean Sparse Polynomial")
+    print(f"Targeting: Latitude, Seasonality Harmonics, Elevation")
+    print(f"Mathematical Accuracy: MAE = {mae:.3f} degrees | R^2 = {r2:.4f}")
+    
+    print("\n--- THE PERFECT ALGEBRAIC FORMULA ---")
+    
+    formula = f"D0_base = {lr.intercept_:.4f}"
+    for feat, coef in zip(surviving_features, lr.coef_):
+        sign = "+" if coef >= 0 else "-"
+        formula += f"\n          {sign} {abs(coef):.6f} * {feat}"
+        
+    print(formula)
+    
+    print("\n--- COPY-PASTE PYTHON IMPLEMENTATION ---")
+    print(f"def get_{prayer_name}_angle(lat, day_of_year, elevation_m):")
+    print(f"    import numpy as np")
+    print(f"    import math")
+    print(f"    lat_abs = abs(lat)")
+    if 'lat_sq' in surviving_features:
+        print(f"    lat_sq = lat ** 2")
+    if 'elev_sqrt' in surviving_features:
+        print(f"    elev_sqrt = math.sqrt(max(0, elevation_m))")
+    if 'doy_sin' in surviving_features:
+        print(f"    doy_sin = math.sin(2 * math.pi * day_of_year / 365.25)")
+    if 'doy_cos' in surviving_features:
+        print(f"    doy_cos = math.cos(2 * math.pi * day_of_year / 365.25)")
+    if 'lat_x_season' in surviving_features:
+        print(f"    lat_x_season = lat_abs * doy_cos")
+        
+    eq = f"    return {lr.intercept_:.4f}"
+    for feat, coef in zip(surviving_features, lr.coef_):
+        sign = "+" if coef >= 0 else "-"
+        eq += f" {sign} ({abs(coef):.6f} * {feat})"
+    print(eq)
+    print("="*60)
+
+if __name__ == '__main__':
+    find_perfect_formula("fajr", "data/processed/fajr_angles.csv")
+    find_perfect_formula("isha", "data/processed/isha_angles.csv")
--- a/src/analyze/ml_analysis.py
+++ b/src/analyze/ml_analysis.py
@ -0,0 +1,73 @@
+import pandas as pd
+import numpy as np
+from sklearn.linear_model import LinearRegression
+from sklearn.preprocessing import PolynomialFeatures
+from sklearn.pipeline import make_pipeline
+from sklearn.metrics import r2_score, mean_absolute_error
+from sklearn.ensemble import RandomForestRegressor
+import warnings
+warnings.filterwarnings('ignore')
+
+def analyze_prayer(prayer_name, csv_file):
+    print(f"\n{'='*50}")
+    print(f" ML Analysis: {prayer_name.upper()}")
+    print(f"{'='*50}")
+    
+    df = pd.read_csv(csv_file)
+    target = f"{prayer_name}_angle"
+    
+    # Drop NaNs
+    df = df.dropna(subset=['lat', 'day_of_year', target])
+    
+    X = df[['lat', 'day_of_year']]
+    y = df[target]
+    
+    print(f"Total Rows: {len(df)}")
+    
+    # 1. Standard Linear Regression
+    lr = LinearRegression()
+    lr.fit(X, y)
+    lr_pred = lr.predict(X)
+    lr_r2 = r2_score(y, lr_pred)
+    lr_mae = mean_absolute_error(y, lr_pred)
+    
+    print("\n--- Model 1: Linear Regression ---")
+    print(f"R2: {lr_r2:.4f} | MAE: {lr_mae:.4f} degrees")
+    print(f"Formula: Angle = {lr.intercept_:.4f} + ({lr.coef_[0]:.4f} * lat) + ({lr.coef_[1]:.4f} * day_of_year)")
+    
+    # 2. Polynomial Regression (Degree 2 - captures exponential/quadratic non-linear relationships)
+    poly = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
+    poly.fit(X, y)
+    poly_pred = poly.predict(X)
+    poly_r2 = r2_score(y, poly_pred)
+    poly_mae = mean_absolute_error(y, poly_pred)
+    
+    print("\n--- Model 2: Polynomial (Non-Linear) Regression (Degree 2) ---")
+    print(f"R2: {poly_r2:.4f} | MAE: {poly_mae:.4f} degrees")
+    
+    # Check if non-linear significantly outperforms linear
+    if poly_r2 > lr_r2 + 0.05:
+        print("-> CONCLUSION: The relationship is strongly non-linear/exponential.")
+    else:
+        print("-> CONCLUSION: The relationship is predominantly linear, with minimal exponential curvature.")
+        
+    # 3. Random Forest (Highly Non-Linear Trees)
+    rf = RandomForestRegressor(n_estimators=50, max_depth=5, random_state=42)
+    rf.fit(X, y)
+    rf_pred = rf.predict(X)
+    rf_r2 = r2_score(y, rf_pred)
+    rf_mae = mean_absolute_error(y, rf_pred)
+    
+    print("\n--- Model 3: Random Forest (Non-linear Tree Ensembles) ---")
+    print(f"R2: {rf_r2:.4f} | MAE: {rf_mae:.4f} degrees")
+    
+    # DPC/Moonsighting baseline simulated evaluation
+    # Moonsighting historically uses dynamic formulas incorporating Latitude and Season (TOY)
+    print("\n--- Feature Importance (Random Forest) ---")
+    importance = rf.feature_importances_
+    print(f"Latitude drives {importance[0]*100:.1f}% of the variance.")
+    print(f"Day of Year (Season) drives {importance[1]*100:.1f}% of the variance.")
+    
+if __name__ == '__main__':
+    analyze_prayer("fajr", "data/processed/fajr_angles.csv")
+    analyze_prayer("isha", "data/processed/isha_angles.csv")
--- a/src/analyze/ml_correlations.py
+++ b/src/analyze/ml_correlations.py
@ -0,0 +1,74 @@
+import pandas as pd
+import numpy as np
+import os
+import sys
+
+def evaluate_models(df, prayer):
+    print(f"\n{'='*50}\nML Feature Discovery: {prayer.upper()} ({len(df)} records)\n{'='*50}")
+    
+    if len(df) == 0:
+        return
+        
+    df['sin_toy'] = np.sin(2 * np.pi * df['day_of_year'] / 365.25)
+    df['cos_toy'] = np.cos(2 * np.pi * df['day_of_year'] / 365.25)
+    
+    # 1. Correlation Matrix
+    cols = ['lat', 'elevation_m', 'day_of_year', 'sin_toy', 'cos_toy', f'{prayer}_angle']
+    corr = df[cols].corr()
+    print("\n--- Pearson Correlation Matrix ---")
+    print(corr[f'{prayer}_angle'].sort_values(ascending=False).to_string())
+    
+    # 2. Linear vs Polynomial (Exponential/Curved) Evaluation
+    # Testing Latitude relationship
+    lat_val = df['lat'].values
+    angle_val = df[f'{prayer}_angle'].values
+    
+    # Linear Fit: y = ax + b
+    p_lin = np.polyfit(lat_val, angle_val, 1)
+    y_lin = np.polyval(p_lin, lat_val)
+    ss_res_lin = np.sum((angle_val - y_lin) ** 2)
+    ss_tot = np.sum((angle_val - np.mean(angle_val)) ** 2)
+    r2_lin = 1 - (ss_res_lin / ss_tot)
+    
+    # Polynomial Fit (Degree 3): y = ax^3 + bx^2 + cx + d
+    p_poly = np.polyfit(lat_val, angle_val, 3)
+    y_poly = np.polyval(p_poly, lat_val)
+    ss_res_poly = np.sum((angle_val - y_poly) ** 2)
+    r2_poly = 1 - (ss_res_poly / ss_tot)
+    
+    print("\n--- Linear vs Exponential (Polynomial) Latitude Dependency ---")
+    print(f"Linear Fit R²:     {r2_lin:.4f}")
+    print(f"Polynomial Fit R²: {r2_poly:.4f}")
+    if r2_poly > r2_lin + 0.05:
+        print(">> OBSERVATION: The relationship with Latitude is highly EXPONENTIAL/NON-LINEAR.")
+    else:
+        print(">> OBSERVATION: The relationship with Latitude is mostly LINEAR or very weak overall.")
+        
+    # 3. Monthly / Seasonal Aggregation
+    df['month'] = pd.to_datetime(df['date']).dt.month
+    monthly_mean = df.groupby('month')[f'{prayer}_angle'].mean()
+    print("\n--- Seasonal (Time of Year) Impact ---")
+    print("Mean Angle by Month:")
+    print(monthly_mean.to_string())
+    season_variance = monthly_mean.max() - monthly_mean.min()
+    print(f">> OBSERVATION: Seasonal fluctuation variance is {season_variance:.2f}°")
+
+def main():
+    base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    fajr_path = os.path.join(base_dir, "data", "processed", "fajr_angles.csv")
+    isha_path = os.path.join(base_dir, "data", "processed", "isha_angles.csv")
+    
+    try:
+        fajr_df = pd.read_csv(fajr_path)
+        evaluate_models(fajr_df, "fajr")
+    except Exception as e:
+        print(f"Fajr parse error: {e}")
+        
+    try:
+        isha_df = pd.read_csv(isha_path)
+        evaluate_models(isha_df, "isha")
+    except Exception as e:
+        print(f"Isha parse error: {e}")
+
+if __name__ == "__main__":
+    main()
--- a/src/analyze/ml_discover_formula.py
+++ b/src/analyze/ml_discover_formula.py
@ -0,0 +1,138 @@
+import pandas as pd
+import numpy as np
+from scipy.optimize import curve_fit
+import os
+
+# ---------------------------------------------------------
+# Candidate Biological/Astronomical Formula Definitions
+# x is a 2D array: x[0] = abs(lat), x[1] = elevation, x[2] = doy
+# ---------------------------------------------------------
+
+def eq1_static(x, a):
+    return np.full_like(x[0], a)
+
+def eq2_lin_lat(x, a, b):
+    return a + b * x[0]
+
+def eq3_quad_lat(x, a, b, c):
+    return a + b * x[0] + c * (x[0]**2)
+
+def eq4_lat_season(x, a, b, c, d):
+    # c is amplitude, d is phase shift
+    return a + b * x[0] + c * np.cos(2 * np.pi * (x[2] - d) / 365.25)
+
+def eq5_elev_lat_season(x, a, b, c, d, e):
+    return a + b * x[0] + c * x[1] + d * np.cos(2 * np.pi * (x[2] - e) / 365.25)
+
+def eq6_lat_dependent_season(x, a, b, c, d):
+    # Amplitude of season scales with latitude
+    return a + b * x[0] + c * x[0] * np.cos(2 * np.pi * (x[2] - d) / 365.25)
+
+def eq7_the_beautiful_match(x, a, b, c, d, e, f):
+    # The ultimate clean parametric equation
+    # Base + A*Lat + B*Lat^2 + C*Elev + D*Lat*Season + E*Season
+    lat = x[0]
+    elev = x[1]
+    season = np.cos(2 * np.pi * (x[2] - f) / 365.25)
+    return a + b * lat + c * (lat**2) + d * elev + e * lat * season
+
+# Map functions to their string representations for readable output
+FORMULAS = [
+    ("Static Baseline", eq1_static, ["Base"]),
+    ("Linear Latitude", eq2_lin_lat, ["Base", "Lat_Coeff"]),
+    ("Quadratic Latitude", eq3_quad_lat, ["Base", "Lat_Coeff", "Lat2_Coeff"]),
+    ("Latitude + Season", eq4_lat_season, ["Base", "Lat_Coeff", "Season_Amp", "Phase_Shift"]),
+    ("Elevation + Lat + Season", eq5_elev_lat_season, ["Base", "Lat_Coeff", "Elev_Coeff", "Season_Amp", "Phase_Shift"]),
+    ("Lat-Dependent Season", eq6_lat_dependent_season, ["Base", "Lat_Coeff", "Season_Lat_Scale", "Phase_Shift"]),
+    ("The Beautiful Match", eq7_the_beautiful_match, ["Base", "Lat_C", "Lat2_C", "Elev_C", "LatXSeason_C", "Phase_Shift"])
+]
+
+def analyze_formula(df, prayer):
+    print(f"\n{'='*70}\n[ {prayer.upper()} ALGORITHM DISCOVERY ] - {len(df)} Records\n{'='*70}")
+    
+    # Prepare input vectors
+    lat_arr = np.abs(df['lat'].values)
+    elev_arr = df['elevation_m'].values
+    doy_arr = df['day_of_year'].values
+    y = df[f'{prayer}_angle'].values
+    
+    x = np.vstack((lat_arr, elev_arr, doy_arr))
+    
+    best_mae = float('inf')
+    best_name = None
+    best_params = None
+    best_popt = None
+    best_func = None
+    
+    for name, func, param_names in FORMULAS:
+        # Initial guess (15.0 for base, 0 for others)
+        p0 = [15.0] + [0.0] * (len(param_names)-1)
+        # Bounding Phase_Shift safely if it exists (usually the last param if 'Phase_Shift' is there)
+        bounds = (-np.inf, np.inf)
+        
+        try:
+            popt, pcov = curve_fit(func, x, y, p0=p0, maxfev=10000)
+            
+            y_pred = func(x, *popt)
+            mae = np.mean(np.abs(y - y_pred))
+            rmse = np.sqrt(np.mean((y - y_pred)**2))
+            
+            print(f"--- {name} ---")
+            print(f"MAE: {mae:.4f}° | RMSE: {rmse:.4f}°")
+            params_str = ", ".join([f"{n}={v:.5f}" for n, v in zip(param_names, popt)])
+            print(f"Parameters: {params_str}")
+            print()
+            
+            if mae < best_mae:
+                best_mae = mae
+                best_name = name
+                best_params = params_str
+                best_popt = popt
+                best_func = func
+                
+        except Exception as e:
+            print(f"Failed to fit {name}: {e}")
+            
+    print(f"\n>> 🏆 WINNING FORMULA FOR {prayer.upper()}: {best_name}")
+    print(f">> BEST MAE: {best_mae:.4f}°")
+    print(f">> CONSTANTS: {best_params}")
+    
+    # Construct the human-readable clean mathematical equation
+    if best_name == "The Beautiful Match":
+        a, b, c, d, e, f = best_popt
+        eq_str = (
+            f"Angle = {a:.4f} "
+            f"{'+' if b>=0 else '-'} {abs(b):.5f} * abs(Lat) "
+            f"{'+' if c>=0 else '-'} {abs(c):.5f} * Lat² "
+            f"{'+' if d>=0 else '-'} {abs(d):.5f} * Elevation "
+            f"{'+' if e>=0 else '-'} {abs(e):.5f} * abs(Lat) * cos(2π * (DayOfYear - {f:.1f}) / 365)"
+        )
+        print(f"\n✨ THE CLEAN FORMULA ✨\n{eq_str}\n")
+    elif best_name == "Lat-Dependent Season":
+        a, b, c, d = best_popt
+        eq_str = (
+            f"Angle = {a:.4f} "
+            f"{'+' if b>=0 else '-'} {abs(b):.5f} * abs(Lat) "
+            f"{'+' if c>=0 else '-'} {abs(c):.5f} * abs(Lat) * cos(2π * (DayOfYear - {d:.1f}) / 365)"
+        )
+        print(f"\n✨ THE CLEAN FORMULA ✨\n{eq_str}\n")
+
+def main():
+    base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    fajr_path = os.path.join(base_dir, "data", "processed", "fajr_angles.csv")
+    isha_path = os.path.join(base_dir, "data", "processed", "isha_angles.csv")
+    
+    # 1. Clean outliers aggressively to find the mathematically purest underlying signal
+    # We will filter out angles < 10 and > 22 for absolute pure signal matching
+    if os.path.exists(fajr_path):
+        df = pd.read_csv(fajr_path)
+        df_clean = df[(df['fajr_angle'] >= 10.0) & (df['fajr_angle'] <= 20.0)]
+        analyze_formula(df_clean, "fajr")
+        
+    if os.path.exists(isha_path):
+        df = pd.read_csv(isha_path)
+        df_clean = df[(df['isha_angle'] >= 11.0) & (df['isha_angle'] <= 21.0)]
+        analyze_formula(df_clean, "isha")
+
+if __name__ == "__main__":
+    main()
--- a/src/analyze/praytimes.py
+++ b/src/analyze/praytimes.py
@ -0,0 +1 @@
+404: Not Found
--- a/src/analyze/run_ml_comparison.py
+++ b/src/analyze/run_ml_comparison.py
@ -0,0 +1,102 @@
+import pandas as pd
+import numpy as np
+import warnings
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import mean_absolute_error, r2_score
+from scipy.optimize import curve_fit
+
+warnings.filterwarnings("ignore")
+
+def exp_func(x, a, b, c):
+    return a * np.exp(b * x) + c
+
+def eval_algorithms(df, name):
+    print(f"\n==========================================")
+    print(f"       {name.upper()} ML PATTERN ANALYSIS       ")
+    print(f"==========================================")
+    print(f"Total verified observations: {len(df)}")
+    
+    # Check column name for angle
+    angle_col = f"{name.lower()}_angle"
+    if angle_col not in df.columns:
+        print(f"Error: {angle_col} not found in columns")
+        return
+        
+    df['doy'] = df['day_of_year']
+    df['lat_abs'] = df['lat'].abs()
+    
+    # Missing value cleaning just in case
+    df = df.dropna(subset=[angle_col, 'doy', 'lat_abs'])
+    
+    # 1. Seasonality Feature Engineering (Sin/Cos mapping of Time of Year)
+    df['doy_sin'] = np.sin(2 * np.pi * df['doy'] / 365)
+    df['doy_cos'] = np.cos(2 * np.pi * df['doy'] / 365)
+    
+    X_lat = df[['lat_abs']].values
+    X_all = df[['lat_abs', 'doy_sin', 'doy_cos']].values
+    y = df[angle_col].values
+    
+    # --- A. BASELINE FIXED ALGORITHM (ISNA/MWL style: 15° or 18°) ---
+    print("\n[A] STATIC ALGORITHM BENCHMARKS (15° vs 18°)")
+    mae_15 = mean_absolute_error(y, np.full_like(y, 15.0))
+    mae_18 = mean_absolute_error(y, np.full_like(y, 18.0))
+    print(f"   Fixed 15° Error (MAE): {mae_15:.3f}°")
+    print(f"   Fixed 18° Error (MAE): {mae_18:.3f}°")
+
+    # --- B. LATITUDE ONLY (Linear vs Exponential) ---
+    print("\n[B] LATITUDE PATTERNS (Equator distance vs Angle)")
+    
+    # Linear Latitude
+    lr_lat = LinearRegression().fit(X_lat, y)
+    y_pred_lat_lin = lr_lat.predict(X_lat)
+    print(f"   Linear Latitude Fit      : R2={r2_score(y, y_pred_lat_lin):.3f}, MAE={mean_absolute_error(y, y_pred_lat_lin):.3f}°")
+    print(f"   -> Coefficient: {lr_lat.coef_[0]:.4f} degrees per 1° latitude move")
+    
+    # Exponential Latitude
+    try:
+        # We model the depression angle dropping exponentially toward the poles
+        popt, _ = curve_fit(exp_func, df['lat_abs'], y, p0=[-1, 0.05, 18], maxfev=5000)
+        y_pred_lat_exp = exp_func(df['lat_abs'], *popt)
+        r2_exp = r2_score(y, y_pred_lat_exp)
+        mae_exp = mean_absolute_error(y, y_pred_lat_exp)
+        print(f"   Exponential Latitude Fit : R2={r2_exp:.3f}, MAE={mae_exp:.3f}°")
+    except Exception as e:
+        print(f"   Exponential Fit failed: {e}")
+
+    # --- C. TIME OF YEAR (SEASONALITY) IMPACT ---
+    print("\n[C] TIME OF YEAR (SEASONAL) CORRELATIONS")
+    lr_all = LinearRegression().fit(X_all, y)
+    y_pred_all = lr_all.predict(X_all)
+    print(f"   Combined Model (Lat + Seasonality): R2={r2_score(y, y_pred_all):.3f}, MAE={mean_absolute_error(y, y_pred_all):.3f}°")
+    
+    # Feature Importance
+    print(f"   -> Lat Importance   : {abs(lr_all.coef_[0]):.4f}")
+    print(f"   -> TOY (Season) Sine: {abs(lr_all.coef_[1]):.4f} amplitude")
+    
+    # --- D. DYNAMIC ALGORITHM COMPARISON ---
+    high_lat_mask = df['lat_abs'] > 40
+    if sum(high_lat_mask) > 0:
+        high_lat_mean = df[high_lat_mask][angle_col].mean()
+        low_lat_mean = df[~high_lat_mask][angle_col].mean()
+        print(f"\n[D] ALGORITHMIC COMPARISON")
+        print(f"   Mean empirical angle at >40° Lat  : {high_lat_mean:.2f}°")
+        print(f"   Mean empirical angle at <=40° Lat : {low_lat_mean:.2f}°")
+        shift = low_lat_mean - high_lat_mean
+        print(f"   Shift detected: {shift:.2f}° absolute reduction when moving towards poles.")
+        if shift > 0.5:
+            print("   => Matches Moonsighting/DPC dynamic logic (angles drop significantly at higher latitudes).")
+        else:
+            print("   => Does NOT strongly match Moonsighting dynamic logic.")
+
+if __name__ == '__main__':
+    try:
+        fajr_df = pd.read_csv('data/processed/fajr_angles.csv')
+        eval_algorithms(fajr_df, 'Fajr')
+    except Exception as e:
+        print("Missing Fajr dataset:", e)
+        
+    try:
+        isha_df = pd.read_csv('data/processed/isha_angles.csv')
+        eval_algorithms(isha_df, 'Isha')
+    except Exception as e:
+        print("Missing Isha dataset:", e)
--- a/src/bsrn_download.py
+++ b/src/bsrn_download.py
@ -0,0 +1,671 @@
+"""
+BSRN Data Downloader and Twilight Extractor
+
+Downloads BSRN irradiance data from:
+1. Zenodo 5-site dataset (quick, 58.5 MB parquet)
+2. CAELUS Zenodo dataset (54 stations, prioritizing 20-45°N)
+
+For each station-year, extracts twilight events:
+- Dawn: moment GHI first exceeds threshold after midnight
+- Dusk: last moment GHI exceeds threshold before midnight
+
+Computes solar depression angle at each event.
+Outputs CSV rows suitable for pray-calc fajr/isha angle analysis.
+"""
+
+import os
+import sys
+import math
+import zipfile
+import requests
+import pandas as pd
+import numpy as np
+from datetime import datetime, timezone, date, timedelta
+from pathlib import Path
+from io import BytesIO
+
+# ── paths ────────────────────────────────────────────────────────────────────
+BASE = Path("/Volumes/X9/Sites/acamarata/pray-calc-ml")
+RAW_BSRN = BASE / "data/raw/bsrn"
+RAW_SIGHTINGS = BASE / "data/raw/raw_sightings"
+RAW_BSRN.mkdir(parents=True, exist_ok=True)
+RAW_SIGHTINGS.mkdir(parents=True, exist_ok=True)
+
+# ── thresholds ────────────────────────────────────────────────────────────────
+# GHI threshold in W/m² to call "detectable irradiance"
+# 0.5 W/m² ~ starlight. 1 W/m² is typical first-light threshold.
+# We extract events at multiple thresholds.
+THRESHOLDS = [0.5, 1.0, 2.0, 5.0, 10.0]
+PRIMARY_THRESHOLD = 1.0  # used for the main sightings CSV
+
+# ── station priority list (CAELUS dataset) ────────────────────────────────────
+# Prioritize stations in 20-45°N, especially Muslim-majority/relevant regions
+PRIORITY_STATIONS = {
+    "sbo": {"name": "Sede Boqer",       "lat": 30.8597, "lon": 34.7794,  "country": "Israel/Negev"},
+    "tam": {"name": "Tamanrasset",      "lat": 22.7903, "lon": 5.5292,   "country": "Algeria"},
+    "xia": {"name": "Xianghe",          "lat": 39.754,  "lon": 116.962,  "country": "China"},
+    "gur": {"name": "Gurgaon",          "lat": 28.4249, "lon": 77.156,   "country": "India"},
+    "iza": {"name": "Izana",            "lat": 28.3093, "lon": -16.4993, "country": "Spain/Canary"},
+    "ish": {"name": "Ishigakijima",     "lat": 24.3367, "lon": 124.1644, "country": "Japan"},
+    "lln": {"name": "Lulin",            "lat": 23.4686, "lon": 120.8736, "country": "Taiwan"},
+    "dra": {"name": "Desert Rock",      "lat": 36.626,  "lon": -116.018, "country": "USA/Nevada"},
+    "e13": {"name": "SouthernGreatPlains","lat": 36.605, "lon": -97.485, "country": "USA/Oklahoma"},
+    "tat": {"name": "Tateno",           "lat": 36.0581, "lon": 140.1258, "country": "Japan"},
+    "fua": {"name": "Fukuoka",          "lat": 33.5822, "lon": 130.3764, "country": "Japan"},
+    "gcr": {"name": "GoodwinCreek",     "lat": 34.2547, "lon": -89.8729, "country": "USA/Mississippi"},
+    "bil": {"name": "Billings",         "lat": 36.605,  "lon": -97.516,  "country": "USA/Oklahoma"},
+}
+
+# Years available in CAELUS per station (approximate; we'll try 2007-2017)
+CAELUS_YEAR_RANGE = list(range(2007, 2018))
+
+# ── solar geometry ────────────────────────────────────────────────────────────
+
+def solar_declination(day_of_year: int) -> float:
+    """Solar declination in degrees (Spencer 1971)."""
+    B = 2 * math.pi * (day_of_year - 1) / 365
+    return (180 / math.pi) * (
+        0.006918
+        - 0.399912 * math.cos(B)
+        + 0.070257 * math.sin(B)
+        - 0.006758 * math.cos(2 * B)
+        + 0.000907 * math.sin(2 * B)
+        - 0.002697 * math.cos(3 * B)
+        + 0.00148  * math.sin(3 * B)
+    )
+
+
+def equation_of_time_minutes(day_of_year: int) -> float:
+    """Equation of time in minutes (Spencer 1971)."""
+    B = 2 * math.pi * (day_of_year - 1) / 365
+    return (180 / math.pi) * (
+        0.000075
+        + 0.001868 * math.cos(B)
+        - 0.032077 * math.sin(B)
+        - 0.014615 * math.cos(2 * B)
+        - 0.04089  * math.sin(2 * B)
+    ) * (180 / 15)  # convert to minutes
+
+
+def solar_hour_angle(utc_hour_frac: float, lon_deg: float, day_of_year: int) -> float:
+    """Solar hour angle in degrees at given UTC fractional hour and longitude."""
+    eot = equation_of_time_minutes(day_of_year)
+    # Local solar time in hours
+    lst = utc_hour_frac + lon_deg / 15.0 + eot / 60.0
+    # Hour angle (0 at solar noon)
+    return 15.0 * (lst - 12.0)
+
+
+def solar_elevation(lat_deg: float, lon_deg: float, utc_dt: datetime) -> float:
+    """Solar elevation angle in degrees (positive above horizon)."""
+    doy = utc_dt.timetuple().tm_yday
+    utc_frac = utc_dt.hour + utc_dt.minute / 60.0 + utc_dt.second / 3600.0
+    decl = math.radians(solar_declination(doy))
+    lat  = math.radians(lat_deg)
+    ha   = math.radians(solar_hour_angle(utc_frac, lon_deg, doy))
+    sin_elev = (
+        math.sin(lat) * math.sin(decl)
+        + math.cos(lat) * math.cos(decl) * math.cos(ha)
+    )
+    sin_elev = max(-1.0, min(1.0, sin_elev))
+    return math.degrees(math.asin(sin_elev))
+
+
+def solar_depression_angle(lat_deg: float, lon_deg: float, utc_dt: datetime) -> float:
+    """Solar depression angle = negative elevation (positive below horizon)."""
+    return -solar_elevation(lat_deg, lon_deg, utc_dt)
+
+
+# ── download helpers ──────────────────────────────────────────────────────────
+
+def download_file(url: str, dest: Path, label: str = "") -> bool:
+    """Download a file with progress. Returns True on success."""
+    if dest.exists() and dest.stat().st_size > 1000:
+        print(f"  [skip] {label or dest.name} already downloaded")
+        return True
+    try:
+        print(f"  [download] {label or dest.name} ...", end=" ", flush=True)
+        r = requests.get(url, timeout=120, stream=True)
+        r.raise_for_status()
+        total = int(r.headers.get("content-length", 0))
+        downloaded = 0
+        with open(dest, "wb") as f:
+            for chunk in r.iter_content(chunk_size=65536):
+                f.write(chunk)
+                downloaded += len(chunk)
+        mb = downloaded / 1_048_576
+        print(f"OK ({mb:.1f} MB)")
+        return True
+    except Exception as e:
+        print(f"FAILED: {e}")
+        if dest.exists():
+            dest.unlink()
+        return False
+
+
+# ── twilight extraction ───────────────────────────────────────────────────────
+
+def extract_twilight_events(
+    df: pd.DataFrame,
+    lat: float,
+    lon: float,
+    station_code: str,
+    station_name: str,
+    source: str,
+    threshold: float = PRIMARY_THRESHOLD,
+) -> list[dict]:
+    """
+    Given a dataframe with columns [timestamp, ghi] (UTC timestamps),
+    extract per-day dawn and dusk events.
+
+    Dawn: first minute where GHI >= threshold after a run of zeros
+    Dusk: last minute where GHI >= threshold before a run of zeros
+
+    Returns a list of dicts with keys:
+      date, event_type, utc_time, ghi_wm2, solar_depression_deg,
+      lat, lon, station_code, station_name, source, threshold_wm2
+    """
+    events = []
+
+    # Ensure UTC
+    if df["timestamp"].dt.tz is None:
+        df = df.copy()
+        df["timestamp"] = df["timestamp"].dt.tz_localize("UTC")
+    else:
+        df = df.copy()
+        df["timestamp"] = df["timestamp"].dt.tz_convert("UTC")
+
+    # Sort and drop nulls
+    df = df.dropna(subset=["ghi"]).sort_values("timestamp").reset_index(drop=True)
+    df["ghi"] = pd.to_numeric(df["ghi"], errors="coerce")
+    df = df.dropna(subset=["ghi"])
+
+    # Group by UTC date (using the date of the timestamp)
+    df["date"] = df["timestamp"].dt.date
+
+    for day, group in df.groupby("date"):
+        group = group.sort_values("timestamp").reset_index(drop=True)
+        ghi = group["ghi"].values
+        ts  = group["timestamp"].values  # numpy datetime64
+
+        # Find dawn: first index where GHI crosses threshold from below
+        # We require at least 3 consecutive sub-threshold readings before the crossing
+        dawn_idx = None
+        for i in range(3, len(ghi)):
+            if ghi[i] >= threshold and all(g < threshold for g in ghi[max(0, i-3):i]):
+                dawn_idx = i
+                break
+
+        # Find dusk: last index where GHI is >= threshold followed by sub-threshold
+        dusk_idx = None
+        for i in range(len(ghi) - 4, -1, -1):
+            if ghi[i] >= threshold and all(g < threshold for g in ghi[i+1:min(len(ghi), i+4)]):
+                dusk_idx = i
+                break
+
+        for idx, event_type in [(dawn_idx, "dawn"), (dusk_idx, "dusk")]:
+            if idx is None:
+                continue
+            ts_val = pd.Timestamp(ts[idx])
+            utc_dt = ts_val.to_pydatetime().replace(tzinfo=timezone.utc)
+            dep = solar_depression_angle(lat, lon, utc_dt)
+            ghi_val = float(ghi[idx])
+
+            # Only record events where sun is clearly below horizon
+            # (eliminates cloudy days where GHI is suppressed during daytime)
+            if dep < 0:  # sun above horizon — skip, not a real twilight event
+                continue
+            if dep > 25:  # sun more than 25° below — likely data noise
+                continue
+
+            events.append({
+                "date":                  str(day),
+                "event_type":            event_type,
+                "utc_time":              utc_dt.strftime("%H:%M"),
+                "ghi_wm2":               round(ghi_val, 2),
+                "solar_depression_deg":  round(dep, 4),
+                "lat":                   lat,
+                "lon":                   lon,
+                "station_code":          station_code,
+                "station_name":          station_name,
+                "source":                source,
+                "threshold_wm2":         threshold,
+            })
+
+    return events
+
+
+# ── dataset 1: BSRN 5-site parquet ───────────────────────────────────────────
+
+FIVE_SITE_URL = "https://zenodo.org/records/10593079/files/bsrn_valid_five_sites.parquet"
+FIVE_SITE_PATH = RAW_BSRN / "bsrn_valid_five_sites.parquet"
+
+FIVE_SITE_META = {
+    "mnm": {"name": "Minamitorishima", "lat": 24.2833, "lon": 153.9667},
+    "asp": {"name": "Alice Springs",   "lat": -23.7984, "lon": 133.8880},
+    "car": {"name": "Carpentras",      "lat": 44.0833, "lon": 5.0583},
+    "bon": {"name": "Bondville",       "lat": 40.0667, "lon": -88.3667},
+    "son": {"name": "Sonnblick",       "lat": 47.0542, "lon": 12.9578},
+}
+
+
+def process_five_site() -> list[dict]:
+    """Download and process the BSRN 5-site parquet dataset."""
+    print("\n=== BSRN 5-site dataset (Zenodo 10593079) ===")
+    ok = download_file(FIVE_SITE_URL, FIVE_SITE_PATH, "bsrn_valid_five_sites.parquet")
+    if not ok:
+        print("  ERROR: could not download 5-site dataset")
+        return []
+
+    print("  Loading parquet ...", end=" ", flush=True)
+    df_all = pd.read_parquet(FIVE_SITE_PATH)
+    print(f"OK — {len(df_all):,} rows, columns: {list(df_all.columns)}")
+
+    all_events = []
+
+    # Detect timestamp column
+    ts_col = None
+    for c in ["timestamp", "time", "datetime", "date_time", "Timestamp", "TIME"]:
+        if c in df_all.columns:
+            ts_col = c
+            break
+    if ts_col is None:
+        # Use first datetime-like column
+        for c in df_all.columns:
+            if pd.api.types.is_datetime64_any_dtype(df_all[c]):
+                ts_col = c
+                break
+    if ts_col is None:
+        print(f"  WARNING: cannot find timestamp column. Columns: {list(df_all.columns)[:20]}")
+        return []
+
+    # Detect GHI column
+    ghi_col = None
+    for c in ["ghi", "GHI", "global_horizontal_irradiance", "SWD", "swd", "global"]:
+        if c in df_all.columns:
+            ghi_col = c
+            break
+    if ghi_col is None:
+        print(f"  WARNING: cannot find GHI column. Columns: {list(df_all.columns)[:20]}")
+        return []
+
+    # Detect station column
+    stn_col = None
+    for c in ["station", "sta", "site", "STATION"]:
+        if c in df_all.columns:
+            stn_col = c
+            break
+
+    print(f"  Using ts_col='{ts_col}', ghi_col='{ghi_col}', station_col='{stn_col}'")
+
+    if stn_col:
+        groups = {stn: sub for stn, sub in df_all.groupby(stn_col)}
+    else:
+        # Single station file — infer from filename or use first meta entry
+        groups = {"unknown": df_all}
+
+    for stn_code, sub in groups.items():
+        stn_key = str(stn_code).lower()
+        meta = FIVE_SITE_META.get(stn_key, None)
+        if meta is None:
+            # Try to find lat/lon in data
+            if "latitude" in sub.columns and "longitude" in sub.columns:
+                lat = float(sub["latitude"].iloc[0])
+                lon = float(sub["longitude"].iloc[0])
+                name = stn_key
+            else:
+                print(f"  SKIP {stn_key}: no metadata")
+                continue
+        else:
+            lat, lon, name = meta["lat"], meta["lon"], meta["name"]
+
+        work = sub[[ts_col, ghi_col]].copy()
+        work.columns = ["timestamp", "ghi"]
+        work["timestamp"] = pd.to_datetime(work["timestamp"], utc=True, errors="coerce")
+        work = work.dropna(subset=["timestamp"])
+
+        print(f"  Processing {name} ({stn_key}): {len(work):,} rows ...", end=" ", flush=True)
+        evts = extract_twilight_events(work, lat, lon, stn_key, name, "bsrn_5site")
+        print(f"{len(evts)} events")
+        all_events.extend(evts)
+
+    return all_events
+
+
+# ── dataset 2: CAELUS ZIP files ───────────────────────────────────────────────
+
+CAELUS_BASE_URL = "https://zenodo.org/api/records/7897639/files/{filename}/content"
+
+
+def caelus_filename(station_code: str, year: int) -> str:
+    return f"{station_code}_bsrn_{year}.zip"
+
+
+def list_caelus_available() -> dict[str, list[int]]:
+    """
+    Probe Zenodo API to get available files, return dict of station -> [years].
+    """
+    print("\n  Querying CAELUS file list from Zenodo API ...", end=" ", flush=True)
+    try:
+        r = requests.get("https://zenodo.org/api/records/7897639", timeout=30)
+        r.raise_for_status()
+        data = r.json()
+        available: dict[str, list[int]] = {}
+        for f in data.get("files", []):
+            fname = f.get("key", "")
+            if fname.endswith(".zip") and "_bsrn_" in fname:
+                parts = fname.replace(".zip", "").split("_bsrn_")
+                if len(parts) == 2:
+                    code = parts[0]
+                    try:
+                        year = int(parts[1])
+                        available.setdefault(code, []).append(year)
+                    except ValueError:
+                        pass
+        print(f"found {len(available)} stations")
+        return available
+    except Exception as e:
+        print(f"FAILED: {e}")
+        return {}
+
+
+def process_caelus_zip(zip_path: Path, station_code: str, meta: dict, year: int) -> list[dict]:
+    """Extract parquet from ZIP, parse, return events."""
+    events = []
+    try:
+        with zipfile.ZipFile(zip_path, "r") as zf:
+            parquet_files = [n for n in zf.namelist() if n.endswith(".parquet")]
+            if not parquet_files:
+                print(f"    WARNING: no parquet in {zip_path.name}")
+                return []
+            for pf in parquet_files:
+                with zf.open(pf) as f:
+                    df = pd.read_parquet(BytesIO(f.read()))
+
+                # Detect columns
+                ts_col = None
+                for c in ["timestamp", "time", "datetime", "Timestamp", "TIME"]:
+                    if c in df.columns:
+                        ts_col = c
+                        break
+                if ts_col is None:
+                    for c in df.columns:
+                        if pd.api.types.is_datetime64_any_dtype(df[c]):
+                            ts_col = c
+                            break
+                if ts_col is None:
+                    print(f"    WARNING: no timestamp in {pf}")
+                    continue
+
+                ghi_col = None
+                for c in ["ghi", "GHI", "SWD", "swd", "global_horizontal_irradiance", "global"]:
+                    if c in df.columns:
+                        ghi_col = c
+                        break
+                if ghi_col is None:
+                    # Use diffuse if available
+                    for c in ["dhi", "DHI", "diffuse"]:
+                        if c in df.columns:
+                            ghi_col = c
+                            break
+                if ghi_col is None:
+                    print(f"    WARNING: no GHI column in {pf}. cols={list(df.columns)[:15]}")
+                    continue
+
+                work = df[[ts_col, ghi_col]].copy()
+                work.columns = ["timestamp", "ghi"]
+                work["timestamp"] = pd.to_datetime(work["timestamp"], utc=True, errors="coerce")
+                work = work.dropna(subset=["timestamp"])
+
+                evts = extract_twilight_events(
+                    work,
+                    meta["lat"], meta["lon"],
+                    station_code, meta["name"],
+                    f"caelus_{year}",
+                )
+                events.extend(evts)
+
+    except zipfile.BadZipFile as e:
+        print(f"    ERROR: bad zip {zip_path.name}: {e}")
+    except Exception as e:
+        print(f"    ERROR processing {zip_path.name}: {e}")
+
+    return events
+
+
+def process_caelus(max_stations: int = 8, max_years_per_station: int = 5) -> list[dict]:
+    """Download and process CAELUS priority stations."""
+    print("\n=== CAELUS dataset (Zenodo 7897639) ===")
+    available = list_caelus_available()
+
+    all_events = []
+
+    processed_count = 0
+    for code, meta in PRIORITY_STATIONS.items():
+        if processed_count >= max_stations:
+            break
+
+        if code not in available:
+            print(f"  SKIP {code} ({meta['name']}): not in CAELUS dataset")
+            continue
+
+        years = sorted(available[code])
+        years_to_use = years[:max_years_per_station]
+        print(f"\n  Station: {meta['name']} ({code}) | lat={meta['lat']} lon={meta['lon']} | years: {years_to_use}")
+
+        station_events = []
+        for year in years_to_use:
+            fname = caelus_filename(code, year)
+            url = CAELUS_BASE_URL.format(filename=fname)
+            dest = RAW_BSRN / fname
+
+            ok = download_file(url, dest, fname)
+            if not ok:
+                continue
+
+            print(f"    Processing {fname} ...", end=" ", flush=True)
+            evts = process_caelus_zip(dest, code, meta, year)
+            print(f"{len(evts)} events")
+            station_events.extend(evts)
+
+        print(f"  Total events for {code}: {len(station_events)}")
+        all_events.extend(station_events)
+        processed_count += 1
+
+    return all_events
+
+
+# ── SURFRAD download ──────────────────────────────────────────────────────────
+
+SURFRAD_STATIONS = {
+    "dra": {
+        "name": "Desert Rock",
+        "lat": 36.626, "lon": -116.018,
+        "ftp_dir": "DRA",
+    },
+    "tbl": {
+        "name": "Table Mountain",
+        "lat": 40.125, "lon": -105.237,
+        "ftp_dir": "TBL",
+    },
+}
+SURFRAD_BASE = "https://gml.noaa.gov/aftp/data/radiation/surfrad"
+
+
+def parse_surfrad_file(content: bytes, station_meta: dict) -> pd.DataFrame:
+    """
+    Parse SURFRAD dat file format.
+    Returns dataframe with [timestamp, ghi] columns.
+    """
+    lines = content.decode("utf-8", errors="replace").splitlines()
+
+    # Skip header (first line is site info, second is column header)
+    # Format: year jday hour min dt zen dw_solar dw_solar_stdev ...
+    records = []
+    for line in lines[2:]:
+        parts = line.split()
+        if len(parts) < 9:
+            continue
+        try:
+            yr   = int(parts[0])
+            jday = int(parts[1])
+            hr   = int(parts[2])
+            mn   = int(parts[3])
+            # GHI is column index 6 (dw_solar = downwelling shortwave)
+            ghi_raw = float(parts[6])
+
+            # Build datetime
+            dt = datetime(yr, 1, 1, hr, mn, tzinfo=timezone.utc) + timedelta(days=jday - 1)
+
+            # SURFRAD uses -9999 for missing
+            ghi = None if ghi_raw < -900 else max(0.0, ghi_raw)
+            records.append({"timestamp": dt, "ghi": ghi})
+        except (ValueError, IndexError):
+            continue
+
+    return pd.DataFrame(records)
+
+
+def process_surfrad(years: list[int] | None = None) -> list[dict]:
+    """Download and process SURFRAD Desert Rock and Table Mountain."""
+    print("\n=== SURFRAD dataset (NOAA GML FTP) ===")
+    if years is None:
+        years = [2018, 2019, 2020]  # 3 recent years
+
+    all_events = []
+
+    for code, meta in SURFRAD_STATIONS.items():
+        station_dir = meta["ftp_dir"]
+        print(f"\n  Station: {meta['name']} ({code}) | lat={meta['lat']} lon={meta['lon']}")
+
+        for year in years:
+            print(f"    Year {year}:")
+            station_events = []
+
+            # SURFRAD stores one file per day: DRA/YYYY/draYYYjjj.dat
+            # where jjj = day of year (001-365)
+            # We'll try to get the full year index
+            year_dir_url = f"{SURFRAD_BASE}/{station_dir}/{year}/"
+            try:
+                r = requests.get(year_dir_url, timeout=30)
+                if r.status_code != 200:
+                    print(f"      Cannot access {year_dir_url}: {r.status_code}")
+                    continue
+                # Parse file listing (Apache directory index)
+                import re
+                files = re.findall(rf'{code.lower()}\d{{5}}\.dat', r.text)
+                files = sorted(set(files))
+                print(f"      Found {len(files)} daily files")
+
+                dest_dir = RAW_BSRN / "surfrad" / station_dir / str(year)
+                dest_dir.mkdir(parents=True, exist_ok=True)
+
+                day_records = []
+                for fname in files:
+                    dest = dest_dir / fname
+                    file_url = f"{year_dir_url}{fname}"
+                    ok = download_file(file_url, dest, fname)
+                    if not ok:
+                        continue
+                    with open(dest, "rb") as f:
+                        content = f.read()
+                    day_df = parse_surfrad_file(content, meta)
+                    if not day_df.empty:
+                        day_records.append(day_df)
+
+                if day_records:
+                    df_year = pd.concat(day_records, ignore_index=True)
+                    print(f"      {len(df_year):,} 1-min records loaded", end=" ")
+                    evts = extract_twilight_events(
+                        df_year, meta["lat"], meta["lon"],
+                        code, meta["name"], f"surfrad_{year}",
+                    )
+                    print(f"→ {len(evts)} twilight events")
+                    station_events.extend(evts)
+
+            except Exception as e:
+                print(f"      ERROR: {e}")
+                continue
+
+        all_events.extend(station_events)
+
+    return all_events
+
+
+# ── save results ──────────────────────────────────────────────────────────────
+
+def save_sightings(events: list[dict], filename: str) -> Path:
+    """Save events list to CSV."""
+    if not events:
+        print(f"  WARNING: no events to save to {filename}")
+        return RAW_SIGHTINGS / filename
+    df = pd.DataFrame(events)
+    df = df.sort_values(["station_code", "date", "event_type"]).reset_index(drop=True)
+    out = RAW_SIGHTINGS / filename
+    df.to_csv(out, index=False)
+    print(f"  Saved {len(df)} events → {out}")
+    return out
+
+
+def print_summary(events: list[dict], label: str):
+    if not events:
+        print(f"\n{label}: no events")
+        return
+    df = pd.DataFrame(events)
+    print(f"\n{'='*60}")
+    print(f"SUMMARY: {label}")
+    print(f"  Total events:    {len(df)}")
+    print(f"  Dawn events:     {(df['event_type']=='dawn').sum()}")
+    print(f"  Dusk events:     {(df['event_type']=='dusk').sum()}")
+    print(f"  Stations:        {df['station_code'].nunique()} — {sorted(df['station_code'].unique())}")
+    print(f"  Date range:      {df['date'].min()} to {df['date'].max()}")
+    dep = df["solar_depression_deg"]
+    print(f"  Depression angle: mean={dep.mean():.2f}° std={dep.std():.2f}° "
+          f"min={dep.min():.2f}° max={dep.max():.2f}°")
+    if "event_type" in df.columns:
+        for evt_type in ["dawn", "dusk"]:
+            sub = df[df["event_type"] == evt_type]
+            if not sub.empty:
+                d = sub["solar_depression_deg"]
+                print(f"  {evt_type.upper()} depression: mean={d.mean():.2f}° "
+                      f"p10={d.quantile(0.1):.2f}° p90={d.quantile(0.9):.2f}°")
+    print(f"{'='*60}")
+
+
+# ── main ──────────────────────────────────────────────────────────────────────
+
+def main():
+    print("BSRN Twilight Extractor")
+    print(f"Output dir: {RAW_SIGHTINGS}")
+    print(f"Raw data:   {RAW_BSRN}")
+    print(f"GHI threshold: {PRIMARY_THRESHOLD} W/m²\n")
+
+    all_events = []
+
+    # 1. BSRN 5-site (quick, 58.5 MB)
+    five_site_events = process_five_site()
+    save_sightings(five_site_events, "bsrn_5site_twilight.csv")
+    print_summary(five_site_events, "BSRN 5-site dataset")
+    all_events.extend(five_site_events)
+
+    # 2. CAELUS (priority stations, up to 8 stations × 5 years)
+    caelus_events = process_caelus(max_stations=8, max_years_per_station=5)
+    save_sightings(caelus_events, "bsrn_caelus_twilight.csv")
+    print_summary(caelus_events, "CAELUS dataset")
+    all_events.extend(caelus_events)
+
+    # 3. SURFRAD (Desert Rock + Table Mountain)
+    surfrad_events = process_surfrad(years=[2018, 2019, 2020])
+    save_sightings(surfrad_events, "surfrad_twilight.csv")
+    print_summary(surfrad_events, "SURFRAD dataset")
+    all_events.extend(surfrad_events)
+
+    # Combined
+    save_sightings(all_events, "bsrn_all_twilight.csv")
+    print_summary(all_events, "ALL SOURCES COMBINED")
+
+    print("\nDone.")
+
+
+if __name__ == "__main__":
+    main()
--- a/src/bsrn_process.py
+++ b/src/bsrn_process.py
@ -0,0 +1,563 @@
+"""
+BSRN Twilight Extractor — corrected for actual file formats.
+
+Dataset formats:
+1. BSRN 5-site parquet (Zenodo 10593079):
+   - Multi-index: (times_utc, site)
+   - Columns: climate, longitude, latitude, sza, eth, ghicda, ghics, difcs, ghi, dif, sky_type
+   - 'site' values like 'asp/bsrn', 'bon/bsrn', etc.
+
+2. CAELUS ZIPs (Zenodo 7897639):
+   - Each ZIP contains one CSV named {code}_bsrn_{year}.csv
+   - Columns: times_utc, sza, eth, ghi, dif, ghics, ghicda, sky_type
+   - sza = solar zenith angle (degrees). Depression = sza - 90 when sza > 90.
+
+For twilight extraction we use BOTH ghi AND dif (diffuse horizontal irradiance).
+Diffuse is more sensitive at twilight — it captures scattered skylight before direct beam arrives.
+
+The `sza` column is authoritative for solar geometry — no need to recompute.
+Depression angle = sza - 90° (valid when sza > 90°, i.e., sun below horizon).
+
+Thresholds extracted (multi-threshold approach):
+- 0.5, 1, 2, 5, 10 W/m² for both ghi and dif
+
+Primary threshold for the main output: dif >= 1.0 W/m² (diffuse is more reliable at twilight).
+"""
+
+import os
+import math
+import zipfile
+import requests
+import re
+import pandas as pd
+import numpy as np
+from datetime import datetime, timezone, timedelta
+from pathlib import Path
+from io import BytesIO
+
+# ── paths ─────────────────────────────────────────────────────────────────────
+BASE = Path("/Volumes/X9/Sites/acamarata/pray-calc-ml")
+RAW_BSRN = BASE / "data/raw/bsrn"
+RAW_SIGHTINGS = BASE / "data/raw/raw_sightings"
+RAW_BSRN.mkdir(parents=True, exist_ok=True)
+RAW_SIGHTINGS.mkdir(parents=True, exist_ok=True)
+
+# ── thresholds ────────────────────────────────────────────────────────────────
+THRESHOLDS = [0.5, 1.0, 2.0, 5.0, 10.0]
+PRIMARY_THRESHOLD = 1.0  # W/m²
+
+# ── station metadata ──────────────────────────────────────────────────────────
+# For CAELUS: station codes and their coordinates
+CAELUS_STATION_META = {
+    "sbo": {"name": "Sede Boqer",         "lat": 30.8597, "lon": 34.7794,  "region": "Negev/Israel"},
+    "tam": {"name": "Tamanrasset",         "lat": 22.7903, "lon": 5.5292,   "region": "Algeria"},
+    "xia": {"name": "Xianghe",             "lat": 39.754,  "lon": 116.962,  "region": "China"},
+    "iza": {"name": "Izana",               "lat": 28.3093, "lon": -16.4993, "region": "Canary Islands/Spain"},
+    "ish": {"name": "Ishigakijima",        "lat": 24.3367, "lon": 124.1644, "region": "Japan"},
+    "dra": {"name": "Desert Rock",         "lat": 36.626,  "lon": -116.018, "region": "Nevada/USA"},
+    "e13": {"name": "SouthernGreatPlains", "lat": 36.605,  "lon": -97.485,  "region": "Oklahoma/USA"},
+    "tat": {"name": "Tateno",              "lat": 36.0581, "lon": 140.1258, "region": "Japan"},
+    "fua": {"name": "Fukuoka",             "lat": 33.5822, "lon": 130.3764, "region": "Japan"},
+    "gcr": {"name": "GoodwinCreek",        "lat": 34.2547, "lon": -89.8729, "region": "Mississippi/USA"},
+    "bil": {"name": "Billings/ARM",        "lat": 36.605,  "lon": -97.516,  "region": "Oklahoma/USA"},
+    "abs": {"name": "Abashiri",            "lat": 44.0178, "lon": 144.2797, "region": "Japan"},
+    "lyu": {"name": "Lanyu Island",        "lat": 22.037,  "lon": 121.5583, "region": "Taiwan"},
+    "lln": {"name": "Lulin",               "lat": 23.4686, "lon": 120.8736, "region": "Taiwan"},
+    "cnr": {"name": "Cener",               "lat": 42.816,  "lon": -1.601,   "region": "Spain/Navarra"},
+    "flo": {"name": "Florina",             "lat": 40.78,   "lon": 21.38,    "region": "Greece"},
+    "car": {"name": "Carpentras",          "lat": 44.0833, "lon": 5.0583,   "region": "France"},
+}
+
+# Five-site parquet station metadata (lat/lon stored in the file itself)
+FIVE_SITE_MAP = {
+    "mnm": {"name": "Minamitorishima", "region": "Japan"},
+    "asp": {"name": "Alice Springs",   "region": "Australia"},
+    "car": {"name": "Carpentras",      "region": "France"},
+    "bon": {"name": "Bondville",       "region": "USA/Illinois"},
+    "son": {"name": "Sonnblick",       "region": "Austria"},
+}
+
+CAELUS_BASE_URL = "https://zenodo.org/api/records/7897639/files/{filename}/content"
+FIVE_SITE_URL   = "https://zenodo.org/records/10593079/files/bsrn_valid_five_sites.parquet"
+FIVE_SITE_PATH  = RAW_BSRN / "bsrn_valid_five_sites.parquet"
+
+# ── download helper ───────────────────────────────────────────────────────────
+
+def download_file(url: str, dest: Path, label: str = "") -> bool:
+    if dest.exists() and dest.stat().st_size > 1000:
+        print(f"  [cached] {label or dest.name}")
+        return True
+    try:
+        print(f"  [dl] {label or dest.name} ...", end=" ", flush=True)
+        r = requests.get(url, timeout=180, stream=True)
+        r.raise_for_status()
+        downloaded = 0
+        with open(dest, "wb") as f:
+            for chunk in r.iter_content(chunk_size=131072):
+                f.write(chunk)
+                downloaded += len(chunk)
+        print(f"OK ({downloaded/1_048_576:.1f} MB)")
+        return True
+    except Exception as e:
+        print(f"FAILED: {e}")
+        if dest.exists():
+            dest.unlink()
+        return False
+
+
+# ── twilight extraction ───────────────────────────────────────────────────────
+
+def extract_events_from_df(
+    df: pd.DataFrame,
+    lat: float,
+    lon: float,
+    station_code: str,
+    station_name: str,
+    source: str,
+) -> list[dict]:
+    """
+    df must have columns: times_utc (datetime64[ns] UTC), sza (float), ghi (float), dif (float).
+
+    For each day:
+      dawn = first minute where dif (or ghi if dif is null) crosses PRIMARY_THRESHOLD
+             AND sza > 90 (sun still below horizon)
+      dusk = last minute where dif (or ghi) >= PRIMARY_THRESHOLD
+             AND sza > 90
+
+    Records depression angle = sza - 90.0 at the event moment.
+    Also records raw threshold-by-threshold data for all THRESHOLDS.
+    """
+    events = []
+
+    df = df.copy()
+    df["times_utc"] = pd.to_datetime(df["times_utc"], utc=True, errors="coerce")
+    df = df.dropna(subset=["times_utc"]).sort_values("times_utc").reset_index(drop=True)
+
+    # Build irradiance signal: prefer dif (diffuse) which is more sensitive at twilight
+    # Fall back to ghi if dif is all NaN
+    if "dif" in df.columns:
+        df["irr"] = pd.to_numeric(df["dif"], errors="coerce")
+    else:
+        df["irr"] = float("nan")
+
+    if "ghi" in df.columns:
+        ghi_vals = pd.to_numeric(df["ghi"], errors="coerce")
+        # Fill irr NaN with ghi
+        df["irr"] = df["irr"].combine_first(ghi_vals)
+
+    df["sza"] = pd.to_numeric(df["sza"], errors="coerce")
+    df["date"] = df["times_utc"].dt.date
+
+    for day, grp in df.groupby("date"):
+        grp = grp.sort_values("times_utc").reset_index(drop=True)
+
+        irr  = grp["irr"].values
+        sza  = grp["sza"].values
+        ts   = grp["times_utc"].values
+
+        n = len(irr)
+        if n < 10:
+            continue
+
+        # For each threshold, extract dawn/dusk
+        for threshold in THRESHOLDS:
+            # Dawn: first index where irr >= threshold AND sza > 90
+            # Require at least 2 consecutive sub-threshold readings immediately before
+            dawn_idx = None
+            for i in range(2, n):
+                if (
+                    not np.isnan(irr[i])
+                    and irr[i] >= threshold
+                    and not np.isnan(sza[i])
+                    and sza[i] > 90.0
+                ):
+                    # Confirm at least 2 prior readings were sub-threshold
+                    prev_ok = all(
+                        (np.isnan(v) or v < threshold)
+                        for v in irr[max(0, i-3):i]
+                    )
+                    if prev_ok:
+                        dawn_idx = i
+                        break
+
+            # Dusk: last index where irr >= threshold AND sza > 90
+            # Require at least 2 consecutive sub-threshold readings immediately after
+            dusk_idx = None
+            for i in range(n - 3, -1, -1):
+                if (
+                    not np.isnan(irr[i])
+                    and irr[i] >= threshold
+                    and not np.isnan(sza[i])
+                    and sza[i] > 90.0
+                ):
+                    after_ok = all(
+                        (np.isnan(v) or v < threshold)
+                        for v in irr[i+1:min(n, i+4)]
+                    )
+                    if after_ok:
+                        dusk_idx = i
+                        break
+
+            for idx, event_type in [(dawn_idx, "dawn"), (dusk_idx, "dusk")]:
+                if idx is None:
+                    continue
+
+                sza_val = float(sza[idx])
+                # sza must be > 90 (sun below horizon) — extra safety check
+                if sza_val <= 90.0:
+                    continue
+                # Depression angle in degrees
+                dep = round(sza_val - 90.0, 4)
+                # Only record plausible twilight depression (0.5° to 20°)
+                if dep < 0.3 or dep > 20.0:
+                    continue
+
+                utc_ts = pd.Timestamp(ts[idx])
+                irr_val = float(irr[idx])
+
+                # Also grab ghi/dif individually at this moment
+                ghi_at = float(grp["ghi"].iloc[idx]) if "ghi" in grp.columns else float("nan")
+                dif_at  = float(grp["dif"].iloc[idx])  if "dif" in grp.columns else float("nan")
+
+                events.append({
+                    "date":                  str(day),
+                    "event_type":            event_type,
+                    "utc_time":              utc_ts.strftime("%H:%M:%S"),
+                    "sza_deg":               round(sza_val, 4),
+                    "solar_depression_deg":  dep,
+                    "irr_wm2":               round(irr_val, 2),
+                    "ghi_wm2":               round(ghi_at, 2) if not math.isnan(ghi_at) else None,
+                    "dif_wm2":               round(dif_at, 2)  if not math.isnan(dif_at) else None,
+                    "threshold_wm2":         threshold,
+                    "lat":                   lat,
+                    "lon":                   lon,
+                    "station_code":          station_code,
+                    "station_name":          station_name,
+                    "source":                source,
+                })
+
+    return events
+
+
+# ── 5-site dataset ────────────────────────────────────────────────────────────
+
+def process_five_site() -> list[dict]:
+    print("\n=== BSRN 5-site parquet (Zenodo 10593079) ===")
+    ok = download_file(FIVE_SITE_URL, FIVE_SITE_PATH, "bsrn_valid_five_sites.parquet")
+    if not ok:
+        return []
+
+    print("  Loading ...", end=" ", flush=True)
+    df_raw = pd.read_parquet(FIVE_SITE_PATH)
+    df_raw = df_raw.reset_index()  # brings times_utc and site into columns
+    print(f"OK — {len(df_raw):,} rows")
+    print(f"  Columns: {list(df_raw.columns)}")
+
+    all_events = []
+    for site_val, grp in df_raw.groupby("site"):
+        # site_val is like 'asp/bsrn'
+        code = str(site_val).split("/")[0].lower()
+        meta = FIVE_SITE_MAP.get(code, {"name": code, "region": "unknown"})
+
+        # Get lat/lon from the data itself
+        lat = float(grp["latitude"].iloc[0])
+        lon = float(grp["longitude"].iloc[0])
+
+        # Rename columns to standard
+        sub = grp[["times_utc", "sza", "ghi", "dif"]].copy()
+
+        print(f"  {meta['name']} ({code}) lat={lat:.3f} lon={lon:.3f}: "
+              f"{len(sub):,} rows ...", end=" ", flush=True)
+
+        evts = extract_events_from_df(sub, lat, lon, code, meta["name"], "bsrn_5site")
+        print(f"{len(evts)} events")
+        all_events.extend(evts)
+
+    return all_events
+
+
+# ── CAELUS dataset ─────────────────────────────────────────────────────────────
+
+def list_caelus_files() -> dict[str, list[int]]:
+    """Return dict of station_code -> sorted list of available years."""
+    print("  Querying Zenodo API ...", end=" ", flush=True)
+    try:
+        r = requests.get("https://zenodo.org/api/records/7897639", timeout=30)
+        r.raise_for_status()
+        data = r.json()
+        available: dict[str, list[int]] = {}
+        for f in data.get("files", []):
+            fname = f.get("key", "")
+            if fname.endswith(".zip") and "_bsrn_" in fname:
+                parts = fname.replace(".zip", "").split("_bsrn_")
+                if len(parts) == 2:
+                    code = parts[0]
+                    try:
+                        year = int(parts[1])
+                        available.setdefault(code, []).append(year)
+                    except ValueError:
+                        pass
+        for k in available:
+            available[k] = sorted(available[k])
+        print(f"{len(available)} stations found")
+        return available
+    except Exception as e:
+        print(f"FAILED: {e}")
+        return {}
+
+
+def process_caelus_zip(zip_path: Path, station_code: str, meta: dict, year: int) -> list[dict]:
+    events = []
+    try:
+        with zipfile.ZipFile(zip_path, "r") as zf:
+            csv_files = [n for n in zf.namelist() if n.endswith(".csv")]
+            if not csv_files:
+                print(f"    WARNING: no CSV in {zip_path.name}")
+                return []
+            with zf.open(csv_files[0]) as f:
+                df = pd.read_csv(f)
+
+        # Validate expected columns
+        required = {"times_utc", "sza", "ghi"}
+        missing = required - set(df.columns)
+        if missing:
+            print(f"    WARNING: missing columns {missing} in {zip_path.name}")
+            return []
+
+        evts = extract_events_from_df(
+            df, meta["lat"], meta["lon"],
+            station_code, meta["name"], f"caelus_{year}",
+        )
+        return evts
+
+    except zipfile.BadZipFile:
+        print(f"    ERROR: corrupt zip {zip_path.name}")
+        return []
+    except Exception as e:
+        print(f"    ERROR {zip_path.name}: {e}")
+        return []
+
+
+def process_caelus(priority_codes: list[str] | None = None, max_years: int = 5) -> list[dict]:
+    print("\n=== CAELUS dataset (Zenodo 7897639) ===")
+    available = list_caelus_files()
+    if not available:
+        return []
+
+    if priority_codes is None:
+        # All known stations, sorted by relevance
+        priority_codes = list(CAELUS_STATION_META.keys())
+
+    all_events = []
+
+    for code in priority_codes:
+        if code not in available:
+            print(f"  SKIP {code}: not in CAELUS")
+            continue
+
+        meta = CAELUS_STATION_META.get(code)
+        if meta is None:
+            print(f"  SKIP {code}: no metadata")
+            continue
+
+        years = available[code][:max_years]
+        print(f"\n  {meta['name']} ({code}) "
+              f"lat={meta['lat']} lon={meta['lon']} region={meta['region']}")
+        print(f"  Years: {years}")
+
+        station_events = []
+        for year in years:
+            fname = f"{code}_bsrn_{year}.zip"
+            url   = CAELUS_BASE_URL.format(filename=fname)
+            dest  = RAW_BSRN / fname
+
+            ok = download_file(url, dest, fname)
+            if not ok:
+                continue
+
+            print(f"    {fname} ...", end=" ", flush=True)
+            evts = process_caelus_zip(dest, code, meta, year)
+            print(f"{len(evts)} events ({year})")
+            station_events.extend(evts)
+
+        print(f"  → {len(station_events)} events total for {code}")
+        all_events.extend(station_events)
+
+    return all_events
+
+
+# ── SURFRAD ───────────────────────────────────────────────────────────────────
+
+SURFRAD_STATIONS = {
+    "dra": {"name": "Desert Rock",  "lat": 36.626, "lon": -116.018, "dir": "DRA"},
+    "tbl": {"name": "TableMountain","lat": 40.125, "lon": -105.237, "dir": "TBL"},
+}
+SURFRAD_BASE = "https://gml.noaa.gov/aftp/data/radiation/surfrad"
+
+
+def parse_surfrad_day(content: bytes) -> pd.DataFrame:
+    """Parse a SURFRAD daily .dat file to a dataframe with [times_utc, sza, ghi, dif]."""
+    lines = content.decode("utf-8", errors="replace").splitlines()
+    records = []
+    for line in lines[2:]:
+        parts = line.split()
+        if len(parts) < 10:
+            continue
+        try:
+            yr   = int(parts[0])
+            jday = int(parts[1])
+            hr   = int(parts[2])
+            mn   = int(parts[3])
+            zen  = float(parts[5])   # solar zenith angle (degrees)
+            ghi  = float(parts[6])   # downwelling shortwave (W/m²)
+            dif  = float(parts[9])   # diffuse shortwave (W/m²)
+
+            dt = datetime(yr, 1, 1, hr, mn, tzinfo=timezone.utc) + timedelta(days=jday - 1)
+
+            # -9999 = missing
+            records.append({
+                "times_utc": dt,
+                "sza":       None if zen < -900 else zen,
+                "ghi":       None if ghi < -900 else max(0.0, ghi),
+                "dif":       None if dif < -900 else max(0.0, dif),
+            })
+        except (ValueError, IndexError):
+            continue
+    return pd.DataFrame(records)
+
+
+def process_surfrad(years: list[int] | None = None) -> list[dict]:
+    print("\n=== SURFRAD (NOAA GML) ===")
+    if years is None:
+        years = [2018, 2019, 2020]
+
+    all_events = []
+
+    for code, meta in SURFRAD_STATIONS.items():
+        print(f"\n  {meta['name']} ({code}) lat={meta['lat']} lon={meta['lon']}")
+        dest_root = RAW_BSRN / "surfrad" / meta["dir"]
+        dest_root.mkdir(parents=True, exist_ok=True)
+
+        for year in years:
+            year_url = f"{SURFRAD_BASE}/{meta['dir']}/{year}/"
+            try:
+                r = requests.get(year_url, timeout=30)
+                if r.status_code != 200:
+                    print(f"    {year}: HTTP {r.status_code} — skipping")
+                    continue
+                # Parse file listing
+                files = sorted(set(re.findall(rf'{code}\d{{5}}\.dat', r.text)))
+                print(f"    {year}: {len(files)} daily files", end=" ")
+            except Exception as e:
+                print(f"    {year}: ERROR listing: {e}")
+                continue
+
+            dest_dir = dest_root / str(year)
+            dest_dir.mkdir(parents=True, exist_ok=True)
+
+            day_dfs = []
+            for fname in files:
+                dest = dest_dir / fname
+                furl = f"{year_url}{fname}"
+                ok = download_file(furl, dest, fname)
+                if not ok:
+                    continue
+                with open(dest, "rb") as f:
+                    day_df = parse_surfrad_day(f.read())
+                if not day_df.empty:
+                    day_dfs.append(day_df)
+
+            if not day_dfs:
+                print("→ 0 events")
+                continue
+
+            df_year = pd.concat(day_dfs, ignore_index=True)
+            evts = extract_events_from_df(
+                df_year, meta["lat"], meta["lon"],
+                code, meta["name"], f"surfrad_{year}",
+            )
+            print(f"→ {len(evts)} events")
+            all_events.extend(evts)
+
+    return all_events
+
+
+# ── save / summarise ──────────────────────────────────────────────────────────
+
+def save_events(events: list[dict], filename: str) -> Path:
+    out = RAW_SIGHTINGS / filename
+    if not events:
+        print(f"  (no events for {filename})")
+        return out
+    df = pd.DataFrame(events)
+    df = df.sort_values(["station_code", "date", "event_type", "threshold_wm2"])
+    df.to_csv(out, index=False)
+    print(f"  Saved {len(df):,} rows → {out}")
+    return out
+
+
+def summarise(events: list[dict], label: str):
+    print(f"\n{'='*65}")
+    print(f"SUMMARY: {label}")
+    if not events:
+        print("  (no events)")
+        print("=" * 65)
+        return
+    df = pd.DataFrame(events)
+    # Focus summary on primary threshold
+    prim = df[df["threshold_wm2"] == PRIMARY_THRESHOLD]
+    print(f"  All threshold events:    {len(df):,}")
+    print(f"  At {PRIMARY_THRESHOLD} W/m² threshold:  {len(prim):,}")
+    print(f"  Stations:  {df['station_code'].nunique()} — {sorted(df['station_code'].unique())}")
+    if not prim.empty:
+        print(f"  Date range: {prim['date'].min()} to {prim['date'].max()}")
+        for evt_type in ["dawn", "dusk"]:
+            sub = prim[prim["event_type"] == evt_type]
+            if sub.empty:
+                continue
+            d = sub["solar_depression_deg"]
+            print(f"  {evt_type.upper()} events: {len(sub):,} | "
+                  f"mean dep={d.mean():.2f}° std={d.std():.2f}° "
+                  f"p10={d.quantile(0.1):.2f}° p90={d.quantile(0.9):.2f}°")
+    print("=" * 65)
+
+
+# ── main ──────────────────────────────────────────────────────────────────────
+
+def main():
+    print("BSRN Twilight Extractor v2")
+    print(f"Primary threshold: {PRIMARY_THRESHOLD} W/m²")
+    print(f"All thresholds: {THRESHOLDS}")
+    print()
+
+    all_events = []
+
+    # 1. BSRN 5-site
+    five_events = process_five_site()
+    save_events(five_events, "bsrn_5site_twilight.csv")
+    summarise(five_events, "BSRN 5-site")
+    all_events.extend(five_events)
+
+    # 2. CAELUS — priority stations in 20-45°N
+    priority = ["sbo", "tam", "xia", "iza", "ish", "dra", "e13", "tat", "fua",
+                "gcr", "bil", "abs", "lyu", "cnr", "flo"]
+    caelus_events = process_caelus(priority_codes=priority, max_years=5)
+    save_events(caelus_events, "bsrn_caelus_twilight.csv")
+    summarise(caelus_events, "CAELUS")
+    all_events.extend(caelus_events)
+
+    # 3. SURFRAD — Desert Rock (arid, 36.6°N) + Table Mountain
+    surfrad_events = process_surfrad(years=[2018, 2019, 2020])
+    save_events(surfrad_events, "surfrad_twilight.csv")
+    summarise(surfrad_events, "SURFRAD")
+    all_events.extend(surfrad_events)
+
+    # Combined output
+    save_events(all_events, "bsrn_all_twilight.csv")
+    summarise(all_events, "ALL SOURCES")
+
+    print("\nDone.")
+
+
+if __name__ == "__main__":
+    main()
--- a/src/collect/academic_paper_fetcher.py
+++ b/src/collect/academic_paper_fetcher.py
@ -0,0 +1,93 @@
+import requests
+import os
+import re
+from pathlib import Path
+import time
+
+PAPERS_DIR = Path("research/downloaded_papers")
+PAPERS_DIR.mkdir(parents=True, exist_ok=True)
+
+# Queries targeting Islamic twilight observations
+QUERIES = [
+    "fajr twilight observation angle",
+    "subuh sky brightness meter",
+    "shafaq twilight measurement isha",
+    "depression angle fajr isha",
+    "true dawn observation measurements"
+]
+
+def fetch_openalex_papers(query):
+    url = f"https://api.openalex.org/works?search={query}&per-page=50&filter=has_pdf_url:true"
+    print(f"Querying OpenAlex: {query}")
+    try:
+        res = requests.get(url, timeout=15)
+        if res.status_code != 200:
+            return []
+        data = res.json()
+        return data.get("results", [])
+    except Exception as e:
+        print(f"Error fetching {query}: {e}")
+        return []
+
+def sanitize_filename(title):
+    return re.sub(r"[^a-zA-Z0-9]+", "_", title).strip("_")
+
+def main():
+    seen_titles = set()
+    total_downloaded = 0
+    
+    # Check existing files to avoid duplicates
+    existing_files = list(PAPERS_DIR.glob("*.pdf"))
+    for file in existing_files:
+        name_no_ext = file.stem.lower()
+        seen_titles.add(name_no_ext)
+        
+    for query in QUERIES:
+        works = fetch_openalex_papers(query)
+        for work in works:
+            title = work.get("title")
+            if not title:
+                continue
+            
+            clean_title = sanitize_filename(title).lower()
+            
+            # Simple duplicate check
+            if any(clean_title[:30] in seen[:30] for seen in seen_titles):
+                print(f"Skipping duplicate/known: {title}")
+                continue
+                
+            pdf_url = work.get("open_access", {}).get("oa_url")
+            if not pdf_url or not pdf_url.endswith(".pdf"):
+                continue
+                
+            year = work.get("publication_year", "0000")
+            filename = f"{year}_{clean_title[:60]}.pdf"
+            filepath = PAPERS_DIR / filename
+            
+            if filepath.exists():
+                print(f"Already downloaded: {filename}")
+                continue
+                
+            print(f"Downloading: {title} ({year})")
+            print(f"  URL: {pdf_url}")
+            try:
+                # Add headers to mimic browser
+                headers = {'User-Agent': 'Mozilla/5.0'}
+                pdf_res = requests.get(pdf_url, headers=headers, timeout=20)
+                if pdf_res.status_code == 200:
+                    with open(filepath, "wb") as f:
+                        f.write(pdf_res.content)
+                    print(f"  -> Saved as {filename}")
+                    seen_titles.add(clean_title)
+                    total_downloaded += 1
+                else:
+                    print(f"  -> Failed with status {pdf_res.status_code}")
+            except Exception as e:
+                print(f"  -> Download failed: {e}")
+            
+            time.sleep(1) # Be nice to servers
+            
+    print(f"\nTotal new papers downloaded: {total_downloaded}")
+
+if __name__ == "__main__":
+    main()
--- a/src/collect/download_gan_mn.py
+++ b/src/collect/download_gan_mn.py
@ -0,0 +1,226 @@
+"""
+Download all GaN-MN monthly and annual CSV files, then process each one
+with the GaN-MN processor and save results to data/raw/raw_sightings/.
+
+Run from repo root:
+    /Volumes/X9/Sites/acamarata/pray-calc-ml/.venv/bin/python \
+        src/collect/download_gan_mn.py
+"""
+from __future__ import annotations
+
+import csv
+import logging
+import sys
+import time
+from pathlib import Path
+
+import requests
+
+# Add repo root to path so we can import the processor
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+sys.path.insert(0, str(REPO_ROOT))
+
+from src.collect.gan_mn_processor import process_gan_mn_csv  # noqa: E402
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+    datefmt="%H:%M:%S",
+)
+log = logging.getLogger(__name__)
+
+RAW_DIR = REPO_ROOT / "data" / "raw" / "gan_mn"
+SIGHTINGS_DIR = REPO_ROOT / "data" / "raw" / "raw_sightings"
+RAW_DIR.mkdir(parents=True, exist_ok=True)
+SIGHTINGS_DIR.mkdir(parents=True, exist_ok=True)
+
+# All monthly files, 2022-2025 (skip already-downloaded ones)
+# Format: (label, bit.ly URL, local filename)
+MONTHLY_FILES = [
+    # 2025 (skip Jun 2025 and Jan 2025 — already have them)
+    ("Aug_2025",  "https://bit.ly/4a9aXIP",  "GaN-MN_August_2025.csv"),
+    ("Sep_2025",  "https://bit.ly/4sm4pxY",  "GaN-MN_September_2025.csv"),
+    ("Jul_2025",  "https://bit.ly/4jlEQtz",  "GaN-MN_July_2025.csv"),
+    ("May_2025",  "https://bit.ly/47w5sCA",  "GaN-MN_May_2025.csv"),
+    ("Apr_2025",  "https://bit.ly/4nxs6Bm",  "GaN-MN_April_2025.csv"),
+    ("Mar_2025",  "https://bit.ly/47mPHzx",  "GaN-MN_March_2025.csv"),
+    ("Feb_2025",  "https://bit.ly/4lbwPXg",  "GaN-MN_February_2025.csv"),
+    # 2024 (skip Aug 2024 — already have it)
+    ("Dec_2024",  "https://bit.ly/4mHSBno",  "GaN-MN_December_2024.csv"),
+    ("Nov_2024",  "https://bit.ly/4lVwidL",  "GaN-MN_November_2024.csv"),
+    ("Oct_2024",  "https://bit.ly/449h3ay",  "GaN-MN_October_2024.csv"),
+    ("Sep_2024",  "https://bit.ly/4hUE458",  "GaN-MN_September_2024.csv"),
+    ("Jul_2024",  "https://bit.ly/4fAjvJi",  "GaN-MN_July_2024.csv"),
+    ("Jun_2024",  "https://bit.ly/3ZvT1DR",  "GaN-MN_June_2024.csv"),
+    ("May_2024",  "https://bit.ly/4f3veR8",  "GaN-MN_May_2024.csv"),
+    ("Apr_2024",  "https://bit.ly/4etfYg0",  "GaN-MN_April_2024.csv"),
+    ("Mar_2024",  "https://bit.ly/3Z69pLy",  "GaN-MN_March_2024.csv"),
+    ("Feb_2024",  "https://bit.ly/46wtyfJ",  "GaN-MN_February_2024.csv"),
+    ("Jan_2024",  "https://bit.ly/4cMztz7",  "GaN-MN_January_2024.csv"),
+    # 2023
+    ("Dec_2023",  "https://bit.ly/45dEDl3",  "GaN-MN_December_2023.csv"),
+    ("Nov_2023",  "https://bit.ly/4ajnV4v",  "GaN-MN_November_2023.csv"),
+    ("Oct_2023",  "https://bit.ly/3PKzryD",  "GaN-MN_October_2023.csv"),
+    ("Sep_2023",  "https://bit.ly/3ww3T8K",  "GaN-MN_September_2023.csv"),
+    ("Aug_2023",  "https://bit.ly/3So1UKV",  "GaN-MN_August_2023.csv"),
+    ("Jul_2023",  "https://bit.ly/3S3mG3u",  "GaN-MN_July_2023.csv"),
+    ("Jun_2023",  "https://bit.ly/46Es4OL",  "GaN-MN_June_2023.csv"),
+    ("May_2023",  "https://bit.ly/46WbF9m",  "GaN-MN_May_2023.csv"),
+    ("Apr_2023",  "https://bit.ly/3rxWWSy",  "GaN-MN_April_2023.csv"),
+    ("Mar_2023",  "https://bit.ly/3OW45nv",  "GaN-MN_March_2023.csv"),
+    ("Feb_2023",  "https://bit.ly/3rSY2br",  "GaN-MN_February_2023.csv"),
+    ("Jan_2023",  "https://bit.ly/3JGJpy1",  "GaN-MN_January_2023.csv"),
+    # 2022
+    ("Dec_2022",  "https://bit.ly/43CelXB",  "GaN-MN_December_2022.csv"),
+    ("Nov_2022",  "https://bit.ly/429RdyR",  "GaN-MN_November_2022.csv"),
+    ("Oct_2022",  "https://bit.ly/3m1onBm",  "GaN-MN_October_2022.csv"),
+    ("Sep_2022",  "http://bit.ly/3SHQJN5",   "GaN-MN_September_2022.csv"),
+    ("Aug_2022",  "http://bit.ly/40hFpdP",   "GaN-MN_August_2022.csv"),
+    ("Jul_2022",  "https://bit.ly/3Z487OD",  "GaN-MN_July_2022.csv"),
+    ("Jun_2022",  "https://bit.ly/3H5CsWS",  "GaN-MN_June_2022.csv"),
+    ("May_2022",  "https://bit.ly/3fnFCJT",  "GaN-MN_May_2022.csv"),
+    ("Apr_2022",  "https://bit.ly/3E8flJr",  "GaN-MN_April_2022.csv"),
+    ("Mar_2022",  "https://bit.ly/3B76VAq",  "GaN-MN_March_2022.csv"),
+    ("Feb_2022",  "http://bit.ly/3BCkXe1",   "GaN-MN_February_2022.csv"),
+    ("Jan_2022",  "http://bit.ly/3IgfkU4",   "GaN-MN_January_2022.csv"),
+]
+
+# Annual file for 2021 (one representative year)
+ANNUAL_FILES = [
+    ("2021", "https://bit.ly/44vtt9T", "GaN-MN_Annual_2021.csv"),
+]
+
+FIELDNAMES = [
+    "prayer", "date_local", "time_local", "utc_offset",
+    "lat", "lng", "elevation_m", "source", "notes",
+]
+
+
+def resolve_and_download(label: str, url: str, dest: Path) -> bool:
+    """
+    Follow bit.ly redirect and download the file to dest.
+    Returns True on success.
+    """
+    if dest.exists():
+        log.info("Already have %s — skipping download", dest.name)
+        return True
+
+    log.info("Downloading %s from %s ...", label, url)
+    try:
+        # bit.ly does a redirect; requests follows by default
+        headers = {
+            "User-Agent": (
+                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+                "AppleWebKit/537.36 (KHTML, like Gecko) "
+                "Chrome/124.0.0.0 Safari/537.36"
+            )
+        }
+        with requests.get(url, headers=headers, stream=True, timeout=120) as resp:
+            resp.raise_for_status()
+            total = int(resp.headers.get("content-length", 0))
+            downloaded = 0
+            with open(dest, "wb") as fh:
+                for chunk in resp.iter_content(chunk_size=1 << 20):  # 1 MB chunks
+                    if chunk:
+                        fh.write(chunk)
+                        downloaded += len(chunk)
+            mb = downloaded / (1 << 20)
+            log.info("Downloaded %s: %.1f MB", dest.name, mb)
+            return True
+    except Exception as exc:
+        log.error("Failed to download %s: %s", label, exc)
+        if dest.exists():
+            dest.unlink()
+        return False
+
+
+def process_and_save(label: str, csv_path: Path) -> int:
+    """
+    Run the GaN-MN processor on csv_path and write results to
+    data/raw/raw_sightings/gan_mn_{label_lower}.csv.
+    Returns count of records written.
+    """
+    slug = label.lower().replace(" ", "_")
+    out_path = SIGHTINGS_DIR / f"gan_mn_{slug}.csv"
+
+    if out_path.exists():
+        # Count existing rows
+        with open(out_path) as fh:
+            existing = sum(1 for _ in fh) - 1  # subtract header
+        log.info("Already processed %s (%d rows) — skipping", out_path.name, existing)
+        return existing
+
+    log.info("Processing %s ...", csv_path.name)
+    try:
+        records = process_gan_mn_csv(csv_path)
+    except Exception as exc:
+        log.error("Processing failed for %s: %s", csv_path.name, exc)
+        return 0
+
+    if not records:
+        log.warning("No records extracted from %s", csv_path.name)
+        return 0
+
+    with open(out_path, "w", newline="") as fh:
+        writer = csv.DictWriter(fh, fieldnames=FIELDNAMES)
+        writer.writeheader()
+        writer.writerows(records)
+
+    log.info("Wrote %d records to %s", len(records), out_path.name)
+    return len(records)
+
+
+def main() -> None:
+    all_files = MONTHLY_FILES + ANNUAL_FILES
+    total_downloaded = 0
+    total_records = 0
+    skipped_download = 0
+    failed_download = []
+    results = []
+
+    for label, url, filename in all_files:
+        dest = RAW_DIR / filename
+        ok = resolve_and_download(label, url, dest)
+        if not ok:
+            failed_download.append(label)
+            continue
+        if dest.stat().st_size < 1000:
+            log.error("%s appears too small (%d bytes), skipping processing", filename, dest.stat().st_size)
+            failed_download.append(label)
+            continue
+
+        total_downloaded += 1
+        count = process_and_save(label, dest)
+        total_records += count
+        results.append((label, count))
+
+        # Brief pause between files to avoid hammering the network
+        time.sleep(0.5)
+
+    # Also count already-existing sightings from the 3 pre-downloaded months
+    pre_existing = ["august_2024", "january_2025", "june_2025"]
+    pre_total = 0
+    for slug in pre_existing:
+        p = SIGHTINGS_DIR / f"gan_mn_{slug}.csv"
+        if p.exists():
+            with open(p) as fh:
+                n = sum(1 for _ in fh) - 1
+            pre_total += n
+            log.info("Pre-existing %s: %d rows", p.name, n)
+
+    log.info("=" * 60)
+    log.info("Download summary:")
+    log.info("  Files newly processed: %d", total_downloaded)
+    log.info("  Records from new files: %d", total_records)
+    log.info("  Records from pre-existing 3 months: %d", pre_total)
+    log.info("  Grand total twilight events: %d", total_records + pre_total)
+    if failed_download:
+        log.warning("  Failed downloads: %s", ", ".join(failed_download))
+
+    log.info("Per-file breakdown:")
+    for label, count in results:
+        log.info("  %-15s %d", label, count)
+
+
+if __name__ == "__main__":
+    main()
--- a/src/collect/madrid_sqm_processor.py
+++ b/src/collect/madrid_sqm_processor.py
@ -0,0 +1,169 @@
+"""
+Processor for the Madrid Zenodo SQM dataset (Record 4633001).
+
+This script reads data/raw/massive_sqm/SQM_evol.csv, representing 10 years of
+continuous minute-by-minute Sky Quality Meter readings in Madrid, Spain.
+
+We apply the BRIN inflection-point detection algorithm:
+- Fajr: Moment of steepest SQM decline in the pre-dawn window.
+- Isha: Moment SQM reaches within 0.5 mag of the dark-night baseline.
+
+Output: data/raw/raw_sightings/madrid_sqm_10yr.csv
+"""
+
+import math
+from pathlib import Path
+from datetime import datetime, timezone, timedelta
+import pandas as pd
+import ephem
+
+# Madrid UCM Observatory from paper metadata
+LAT = 40.45 
+LNG = -3.73
+ELEVATION_M = 640
+UTC_OFFSET = 1.0  # Winter UTC+1, though data is in UTC so we just use this for local labeling
+
+RAW_CSV = Path("data/raw/massive_sqm/SQM_evol.csv")
+OUT_FILE = Path("data/raw/raw_sightings/madrid_sqm_10yr.csv")
+
+MIN_DARK_SKY_MPSAS = 18.0
+MAX_MOON_ALT = 5.0
+
+def get_solar_altitude(utc_dt: datetime) -> float:
+    obs = ephem.Observer()
+    obs.lat = str(LAT)
+    obs.lon = str(LNG)
+    obs.elevation = ELEVATION_M
+    obs.pressure = 1013.25
+    obs.temp = 15.0
+    if utc_dt.tzinfo is not None:
+        utc_dt = utc_dt.replace(tzinfo=None)
+    obs.date = ephem.Date(utc_dt)
+    sun = ephem.Sun(obs)
+    return math.degrees(float(sun.alt))
+
+def process_madrid_data():
+    if not RAW_CSV.exists():
+        print(f"File not found: {RAW_CSV}")
+        return
+
+    print("Loading Madrid SQM dataset. This may take a minute...")
+    # Format: Time(UTC);Magnitude
+    df = pd.read_csv(RAW_CSV, sep=';', usecols=['Time(UTC)', 'Magnitude'])
+    df['utc_dt'] = pd.to_datetime(df['Time(UTC)']).dt.tz_localize(timezone.utc)
+    df = df.rename(columns={'Magnitude': 'mpsas'})
+    
+    # Filter bad data
+    df = df[df['mpsas'] > 0]
+    df = df.sort_values('utc_dt').reset_index(drop=True)
+
+    print(f"Loaded {len(df)} rows. Processing nights...")
+    
+    # Identify unique local calendar dates in the dataset
+    df['local_date'] = (df['utc_dt'] + timedelta(hours=UTC_OFFSET)).dt.date
+    unique_dates = df['local_date'].unique()
+    
+    records = []
+    
+    obs = ephem.Observer()
+    obs.lat = str(LAT)
+    obs.lon = str(LNG)
+    obs.elevation = ELEVATION_M
+    obs.pressure = 1013.25
+    obs.temp = 15.0
+
+    print(f"Found {len(unique_dates)} nights to process.")
+    for local_date in unique_dates:
+        # Midnight UTC for the given local date
+        noon_utc = datetime(local_date.year, local_date.month, local_date.day, 12, 0, tzinfo=timezone.utc) - timedelta(hours=UTC_OFFSET)
+        
+        obs.date = ephem.Date(noon_utc.replace(tzinfo=None))
+        try:
+            sunset_utc = obs.next_setting(ephem.Sun()).datetime().replace(tzinfo=timezone.utc)
+            sunrise_utc = obs.next_rising(ephem.Sun()).datetime().replace(tzinfo=timezone.utc)
+        except ephem.AlwaysUpError:
+            continue
+        except ephem.NeverUpError:
+            continue
+
+        # FAJR
+        window_start = sunrise_utc - timedelta(hours=5)
+        window_end = sunrise_utc - timedelta(minutes=10)
+        predawn = df[(df['utc_dt'] >= window_start) & (df['utc_dt'] <= window_end)].copy()
+        
+        if len(predawn) >= 30:
+            predawn['sun_alt'] = [get_solar_altitude(dt) for dt in predawn['utc_dt']]
+            predawn = predawn[predawn['sun_alt'] < -5.0]
+            
+            early = predawn[predawn['sun_alt'] < -18.0]
+            if early.empty: 
+                early = predawn[predawn['sun_alt'] < -16.0]
+            if not early.empty and early['mpsas'].max() >= MIN_DARK_SKY_MPSAS:
+                predawn['mpsas_smooth'] = predawn['mpsas'].rolling(window=5, center=True, min_periods=3).mean()
+                predawn['dmpsas'] = predawn['mpsas_smooth'].diff()
+                
+                active = predawn[(predawn['sun_alt'] >= -25.0) & (predawn['sun_alt'] <= -5.0)]
+                if len(active) >= 10:
+                    steepest_idx = active['dmpsas'].idxmin()
+                    if not pd.isna(steepest_idx):
+                        dawn_row = predawn.loc[steepest_idx]
+                        depression_angle = -dawn_row['sun_alt']
+                        
+                        if 10.0 <= depression_angle <= 22.0:
+                            local_time = dawn_row['utc_dt'] + timedelta(hours=UTC_OFFSET)
+                            records.append({
+                                "prayer": "fajr",
+                                "date_local": local_time.strftime("%Y-%m-%d"),
+                                "time_local": local_time.strftime("%H:%M"),
+                                "utc_offset": UTC_OFFSET,
+                                "lat": LAT,
+                                "lng": LNG,
+                                "elevation_m": ELEVATION_M,
+                                "source": "Madrid Zenodo SQM (10-year)",
+                                "notes": f"SQM inflection point Fajr detection; Madrid UCM Observatory; dark-sky baseline>={MIN_DARK_SKY_MPSAS}"
+                            })
+
+        # ISHA
+        window_end_isha = sunset_utc + timedelta(hours=5)
+        evening = df[(df['utc_dt'] >= sunset_utc) & (df['utc_dt'] <= window_end_isha)].copy()
+        
+        if len(evening) >= 30:
+            evening['sun_alt'] = [get_solar_altitude(dt) for dt in evening['utc_dt']]
+            evening = evening[evening['sun_alt'] < 0]
+            
+            deep_night = evening[(evening['utc_dt'] >= sunset_utc + timedelta(hours=2)) & 
+                               (evening['utc_dt'] <= sunset_utc + timedelta(hours=6)) &
+                               (evening['sun_alt'] < -18.0)]
+                               
+            if len(deep_night) >= 10:
+                dark_baseline = deep_night['mpsas'].median()
+                if dark_baseline >= MIN_DARK_SKY_MPSAS:
+                    evening['mpsas_smooth'] = evening['mpsas'].rolling(window=5, center=True, min_periods=3).mean()
+                    threshold = dark_baseline - 0.5
+                    
+                    reached = evening[(evening['mpsas_smooth'] >= threshold) & (evening['sun_alt'] < -12.0)]
+                    if not reached.empty:
+                        isha_row = reached.iloc[0]
+                        depression_angle = -isha_row['sun_alt']
+                        
+                        if 12.0 <= depression_angle <= 22.0:
+                            local_time = isha_row['utc_dt'] + timedelta(hours=UTC_OFFSET)
+                            records.append({
+                                "prayer": "isha",
+                                "date_local": local_time.strftime("%Y-%m-%d"),
+                                "time_local": local_time.strftime("%H:%M"),
+                                "utc_offset": UTC_OFFSET,
+                                "lat": LAT,
+                                "lng": LNG,
+                                "elevation_m": ELEVATION_M,
+                                "source": "Madrid Zenodo SQM (10-year)",
+                                "notes": f"SQM threshold Isha detection (Shafaq Abyad); dark-sky baseline={dark_baseline:.2f}"
+                            })
+
+    out_df = pd.DataFrame(records)
+    OUT_FILE.parent.mkdir(parents=True, exist_ok=True)
+    out_df.to_csv(OUT_FILE, index=False)
+    print(f"Successfully processed {len(records)} Fajr/Isha observations to {OUT_FILE}.")
+
+if __name__ == "__main__":
+    process_madrid_data()
--- a/src/surfrad_process.py
+++ b/src/surfrad_process.py
@ -0,0 +1,282 @@
+"""
+SURFRAD Downloader and Twilight Extractor — standalone.
+
+Stations:
+- dra: Desert Rock, Nevada (36.626°N, 116.018°W) — arid, excellent visibility
+- tbl: Table Mountain, Colorado (40.125°N, 105.237°W) — high altitude (1689m)
+- gwn: Goodwin Creek, Mississippi (34.255°N, 89.873°W) — humid subtropical
+
+File naming: {code}{YY}{DDD}.dat
+  YY  = 2-digit year
+  DDD = day of year (001-366)
+
+Column layout (SURFRAD standard format, v1):
+  0: year
+  1: jday (day of year)
+  2: month
+  3: day
+  4: hour (UTC)
+  5: min
+  6: dt (time difference in minutes from standard time)
+  7: zen (solar zenith angle, degrees)
+  8: dw_solar (downwelling shortwave, W/m²) ← GHI
+  9: dw_solar_sd
+ 10: direct (direct normal irradiance, W/m²)
+ 11: direct_sd
+ 12: diffuse (diffuse horizontal, W/m²) ← DIF
+ 13: diffuse_sd
+ 14+: other measurements
+
+Missing value: -9999.9
+"""
+
+import re
+import math
+import requests
+import pandas as pd
+import numpy as np
+from datetime import datetime, timezone, timedelta, date
+from pathlib import Path
+
+BASE = Path("/Volumes/X9/Sites/acamarata/pray-calc-ml")
+RAW_BSRN = BASE / "data/raw/bsrn"
+RAW_SIGHTINGS = BASE / "data/raw/raw_sightings"
+
+THRESHOLDS = [0.5, 1.0, 2.0, 5.0, 10.0]
+SURFRAD_BASE = "https://gml.noaa.gov/aftp/data/radiation/surfrad"
+
+STATIONS = {
+    "dra": {"name": "Desert Rock",   "lat": 36.626,  "lon": -116.018, "dir": "dra"},
+    "tbl": {"name": "TableMountain", "lat": 40.125,  "lon": -105.237, "dir": "tbl"},
+    "gwn": {"name": "GoodwinCreek",  "lat": 34.2547, "lon": -89.8729, "dir": "gwn"},
+}
+
+
+def download_file(url: str, dest: Path) -> bool:
+    if dest.exists() and dest.stat().st_size > 500:
+        return True
+    try:
+        r = requests.get(url, timeout=60)
+        r.raise_for_status()
+        dest.write_bytes(r.content)
+        return True
+    except Exception:
+        return False
+
+
+def parse_surfrad_dat(content: bytes) -> pd.DataFrame:
+    """
+    Parse SURFRAD .dat file.
+    Returns df with [times_utc, sza, ghi, dif].
+    """
+    lines = content.decode("utf-8", errors="replace").splitlines()
+
+    # First 2 lines are headers; data starts line 2
+    records = []
+    for line in lines[2:]:
+        parts = line.split()
+        if len(parts) < 13:
+            continue
+        try:
+            yr   = int(parts[0])
+            jday = int(parts[1])
+            hr   = int(parts[4])
+            mn   = int(parts[5])
+            zen  = float(parts[7])
+            ghi  = float(parts[8])
+            dif  = float(parts[12])
+
+            dt = datetime(yr, 1, 1, hr, mn, tzinfo=timezone.utc) + timedelta(days=jday - 1)
+            records.append({
+                "times_utc": dt,
+                "sza":       None if zen < -900 else zen,
+                "ghi":       None if ghi < -900 else max(0.0, ghi),
+                "dif":       None if dif < -900 else max(0.0, dif),
+            })
+        except (ValueError, IndexError):
+            continue
+
+    return pd.DataFrame(records)
+
+
+def extract_events(df: pd.DataFrame, lat: float, lon: float,
+                   station_code: str, station_name: str,
+                   source: str) -> list[dict]:
+    """Same logic as bsrn_process.py extract_events_from_df."""
+    events = []
+    df = df.copy()
+    df["times_utc"] = pd.to_datetime(df["times_utc"], utc=True, errors="coerce")
+    df = df.dropna(subset=["times_utc"]).sort_values("times_utc").reset_index(drop=True)
+
+    # Use diffuse as primary signal, fall back to GHI
+    df["irr"] = pd.to_numeric(df.get("dif", pd.Series()), errors="coerce")
+    if "ghi" in df.columns:
+        df["irr"] = df["irr"].combine_first(pd.to_numeric(df["ghi"], errors="coerce"))
+
+    df["sza"] = pd.to_numeric(df["sza"], errors="coerce")
+    df["date"] = df["times_utc"].dt.date
+
+    for day, grp in df.groupby("date"):
+        grp = grp.sort_values("times_utc").reset_index(drop=True)
+        irr = grp["irr"].values
+        sza = grp["sza"].values
+        ts  = grp["times_utc"].values
+        n   = len(irr)
+        if n < 10:
+            continue
+
+        for threshold in THRESHOLDS:
+            dawn_idx = None
+            for i in range(2, n):
+                if (not np.isnan(irr[i]) and irr[i] >= threshold
+                        and not np.isnan(sza[i]) and sza[i] > 90.0):
+                    prev_ok = all(
+                        (np.isnan(v) or v < threshold)
+                        for v in irr[max(0, i-3):i]
+                    )
+                    if prev_ok:
+                        dawn_idx = i
+                        break
+
+            dusk_idx = None
+            for i in range(n - 3, -1, -1):
+                if (not np.isnan(irr[i]) and irr[i] >= threshold
+                        and not np.isnan(sza[i]) and sza[i] > 90.0):
+                    after_ok = all(
+                        (np.isnan(v) or v < threshold)
+                        for v in irr[i+1:min(n, i+4)]
+                    )
+                    if after_ok:
+                        dusk_idx = i
+                        break
+
+            for idx, event_type in [(dawn_idx, "dawn"), (dusk_idx, "dusk")]:
+                if idx is None:
+                    continue
+                sza_val = float(sza[idx])
+                if sza_val <= 90.0:
+                    continue
+                dep = round(sza_val - 90.0, 4)
+                if dep < 0.3 or dep > 20.0:
+                    continue
+                utc_ts   = pd.Timestamp(ts[idx])
+                irr_val  = float(irr[idx])
+                ghi_at   = float(grp["ghi"].iloc[idx]) if "ghi" in grp.columns else float("nan")
+                dif_at   = float(grp["dif"].iloc[idx]) if "dif" in grp.columns else float("nan")
+
+                events.append({
+                    "date":                 str(day),
+                    "event_type":           event_type,
+                    "utc_time":             utc_ts.strftime("%H:%M:%S"),
+                    "sza_deg":              round(sza_val, 4),
+                    "solar_depression_deg": dep,
+                    "irr_wm2":              round(irr_val, 2),
+                    "ghi_wm2":              round(ghi_at, 2) if not math.isnan(ghi_at) else None,
+                    "dif_wm2":              round(dif_at, 2)  if not math.isnan(dif_at) else None,
+                    "threshold_wm2":        threshold,
+                    "lat":                  lat,
+                    "lon":                  lon,
+                    "station_code":         station_code,
+                    "station_name":         station_name,
+                    "source":               source,
+                })
+
+    return events
+
+
+def process_station(code: str, meta: dict, years: list[int]) -> list[dict]:
+    print(f"\n  {meta['name']} ({code}) lat={meta['lat']} lon={meta['lon']}")
+    dir_code = meta["dir"]
+    dest_root = RAW_BSRN / "surfrad" / dir_code
+    dest_root.mkdir(parents=True, exist_ok=True)
+
+    all_events = []
+    for year in years:
+        yy = str(year)[2:]  # 2-digit year
+        # Build list of days
+        import calendar
+        days_in_year = 366 if calendar.isleap(year) else 365
+
+        dest_dir = dest_root / str(year)
+        dest_dir.mkdir(parents=True, exist_ok=True)
+
+        day_dfs = []
+        success_count = 0
+        for doy in range(1, days_in_year + 1):
+            fname = f"{dir_code}{yy}{doy:03d}.dat"
+            dest  = dest_dir / fname
+            url   = f"{SURFRAD_BASE}/{dir_code}/{year}/{fname}"
+
+            ok = download_file(url, dest)
+            if not ok:
+                continue
+            with open(dest, "rb") as f:
+                day_df = parse_surfrad_dat(f.read())
+            if not day_df.empty:
+                day_dfs.append(day_df)
+                success_count += 1
+
+        if not day_dfs:
+            print(f"    {year}: 0 days downloaded")
+            continue
+
+        df_year = pd.concat(day_dfs, ignore_index=True)
+        evts = extract_events(df_year, meta["lat"], meta["lon"],
+                              code, meta["name"], f"surfrad_{year}")
+        print(f"    {year}: {success_count} days → {len(evts)} twilight events "
+              f"({len(df_year):,} 1-min records)")
+        all_events.extend(evts)
+
+    return all_events
+
+
+def main():
+    print("SURFRAD Twilight Extractor")
+    years = [2018, 2019, 2020]
+    print(f"Years: {years}")
+    print(f"Stations: {list(STATIONS.keys())}")
+
+    all_events = []
+    for code, meta in STATIONS.items():
+        evts = process_station(code, meta, years)
+        all_events.extend(evts)
+        print(f"  → {len(evts)} total events for {code}")
+
+    # Save
+    out = RAW_SIGHTINGS / "surfrad_twilight.csv"
+    if all_events:
+        df = pd.DataFrame(all_events)
+        df = df.sort_values(["station_code", "date", "event_type", "threshold_wm2"])
+        df.to_csv(out, index=False)
+        print(f"\nSaved {len(df):,} rows → {out}")
+
+        prim = df[df["threshold_wm2"] == 1.0]
+        print(f"\nSUMMARY (1.0 W/m² threshold):")
+        print(f"  Total: {len(prim):,} events")
+        for evt in ["dawn", "dusk"]:
+            sub = prim[prim["event_type"] == evt]
+            if sub.empty:
+                continue
+            d = sub["solar_depression_deg"]
+            print(f"  {evt.upper()}: {len(sub):,} | "
+                  f"mean={d.mean():.2f}° p10={d.quantile(0.1):.2f}° "
+                  f"p90={d.quantile(0.9):.2f}°")
+    else:
+        print("No events extracted.")
+
+    # Merge into bsrn_all_twilight.csv
+    all_file = RAW_SIGHTINGS / "bsrn_all_twilight.csv"
+    if all_file.exists() and all_events:
+        df_existing = pd.read_csv(all_file)
+        # Remove old surfrad rows if any
+        df_existing = df_existing[~df_existing["source"].str.startswith("surfrad", na=False)]
+        df_new = pd.concat([df_existing, pd.DataFrame(all_events)], ignore_index=True)
+        df_new = df_new.sort_values(["station_code", "date", "event_type", "threshold_wm2"])
+        df_new.to_csv(all_file, index=False)
+        print(f"\nUpdated bsrn_all_twilight.csv → {len(df_new):,} total rows")
+
+    print("\nDone.")
+
+
+if __name__ == "__main__":
+    main()
--- a/src/surfrad_process2.py
+++ b/src/surfrad_process2.py
@ -0,0 +1,204 @@
+"""
+SURFRAD processor — reads already-downloaded files in DRA/ and TBL/ dirs.
+Also downloads remaining missing days.
+"""
+
+import math
+import requests
+import pandas as pd
+import numpy as np
+import calendar
+from datetime import datetime, timezone, timedelta
+from pathlib import Path
+
+BASE = Path("/Volumes/X9/Sites/acamarata/pray-calc-ml")
+RAW_BSRN = BASE / "data/raw/bsrn"
+RAW_SIGHTINGS = BASE / "data/raw/raw_sightings"
+THRESHOLDS = [0.5, 1.0, 2.0, 5.0, 10.0]
+SURFRAD_BASE = "https://gml.noaa.gov/aftp/data/radiation/surfrad"
+
+STATIONS = {
+    "dra": {"name": "Desert Rock",   "lat": 36.626,  "lon": -116.018, "dir_code": "DRA", "file_prefix": "dra"},
+    "tbl": {"name": "TableMountain", "lat": 40.125,  "lon": -105.237, "dir_code": "TBL", "file_prefix": "tbl"},
+}
+YEARS = [2018, 2019, 2020]
+
+
+def download_file(url: str, dest: Path) -> bool:
+    if dest.exists() and dest.stat().st_size > 200:
+        return True
+    try:
+        r = requests.get(url, timeout=30)
+        if r.status_code != 200:
+            return False
+        dest.write_bytes(r.content)
+        return True
+    except Exception:
+        return False
+
+
+def parse_surfrad_dat(fpath: Path) -> pd.DataFrame:
+    lines = fpath.read_bytes().decode("utf-8", errors="replace").splitlines()
+    records = []
+    for line in lines[2:]:
+        parts = line.split()
+        if len(parts) < 13:
+            continue
+        try:
+            yr   = int(parts[0])
+            jday = int(parts[1])
+            hr   = int(parts[4])
+            mn   = int(parts[5])
+            zen  = float(parts[7])
+            ghi  = float(parts[8])
+            dif  = float(parts[12])
+            dt   = datetime(yr, 1, 1, hr, mn, tzinfo=timezone.utc) + timedelta(days=jday - 1)
+            records.append({
+                "times_utc": dt,
+                "sza":  None if zen < -900 else zen,
+                "ghi":  None if ghi < -900 else max(0.0, ghi),
+                "dif":  None if dif < -900 else max(0.0, dif),
+            })
+        except (ValueError, IndexError):
+            continue
+    return pd.DataFrame(records)
+
+
+def extract_events(df, lat, lon, code, name, source):
+    events = []
+    df = df.copy()
+    df["times_utc"] = pd.to_datetime(df["times_utc"], utc=True, errors="coerce")
+    df = df.dropna(subset=["times_utc"]).sort_values("times_utc").reset_index(drop=True)
+    df["irr"] = pd.to_numeric(df.get("dif", pd.Series(dtype=float)), errors="coerce")
+    if "ghi" in df.columns:
+        df["irr"] = df["irr"].combine_first(pd.to_numeric(df["ghi"], errors="coerce"))
+    df["sza"] = pd.to_numeric(df["sza"], errors="coerce")
+    df["date"] = df["times_utc"].dt.date
+
+    for day, grp in df.groupby("date"):
+        grp = grp.sort_values("times_utc").reset_index(drop=True)
+        irr = grp["irr"].values
+        sza = grp["sza"].values
+        ts  = grp["times_utc"].values
+        n   = len(irr)
+        if n < 10:
+            continue
+        for threshold in THRESHOLDS:
+            dawn_idx = None
+            for i in range(2, n):
+                if (not np.isnan(irr[i]) and irr[i] >= threshold
+                        and not np.isnan(sza[i]) and sza[i] > 90.0
+                        and all((np.isnan(v) or v < threshold) for v in irr[max(0,i-3):i])):
+                    dawn_idx = i
+                    break
+            dusk_idx = None
+            for i in range(n-3, -1, -1):
+                if (not np.isnan(irr[i]) and irr[i] >= threshold
+                        and not np.isnan(sza[i]) and sza[i] > 90.0
+                        and all((np.isnan(v) or v < threshold) for v in irr[i+1:min(n,i+4)])):
+                    dusk_idx = i
+                    break
+            for idx, etype in [(dawn_idx, "dawn"), (dusk_idx, "dusk")]:
+                if idx is None:
+                    continue
+                sza_val = float(sza[idx])
+                if sza_val <= 90.0:
+                    continue
+                dep = round(sza_val - 90.0, 4)
+                if dep < 0.3 or dep > 20.0:
+                    continue
+                utc_ts = pd.Timestamp(ts[idx])
+                ghi_at = float(grp["ghi"].iloc[idx]) if "ghi" in grp.columns else float("nan")
+                dif_at = float(grp["dif"].iloc[idx]) if "dif" in grp.columns else float("nan")
+                events.append({
+                    "date": str(day), "event_type": etype,
+                    "utc_time": utc_ts.strftime("%H:%M:%S"),
+                    "sza_deg": round(sza_val, 4),
+                    "solar_depression_deg": dep,
+                    "irr_wm2": round(float(irr[idx]), 2),
+                    "ghi_wm2": round(ghi_at, 2) if not math.isnan(ghi_at) else None,
+                    "dif_wm2": round(dif_at, 2)  if not math.isnan(dif_at) else None,
+                    "threshold_wm2": threshold,
+                    "lat": lat, "lon": lon,
+                    "station_code": code, "station_name": name, "source": source,
+                })
+    return events
+
+
+def main():
+    print("SURFRAD Processor v2 — using cached downloads + filling gaps")
+    all_events = []
+
+    for code, meta in STATIONS.items():
+        print(f"\n{meta['name']} ({code})")
+        dir_code   = meta["dir_code"]
+        prefix     = meta["file_prefix"]
+        dest_root  = RAW_BSRN / "surfrad" / dir_code
+
+        for year in YEARS:
+            yy = str(year)[2:]
+            days_in_year = 366 if calendar.isleap(year) else 365
+            dest_dir = dest_root / str(year)
+            dest_dir.mkdir(parents=True, exist_ok=True)
+
+            # Download any missing days
+            missing = 0
+            for doy in range(1, days_in_year + 1):
+                fname = f"{prefix}{yy}{doy:03d}.dat"
+                dest  = dest_dir / fname
+                if dest.exists() and dest.stat().st_size > 200:
+                    continue
+                url = f"{SURFRAD_BASE}/{prefix}/{year}/{fname}"
+                if download_file(url, dest):
+                    missing += 1
+
+            # Parse all available files
+            day_dfs = []
+            for fpath in sorted(dest_dir.glob(f"{prefix}{yy}*.dat")):
+                day_df = parse_surfrad_dat(fpath)
+                if not day_df.empty:
+                    day_dfs.append(day_df)
+
+            if not day_dfs:
+                print(f"  {year}: no data files found")
+                continue
+
+            df_year = pd.concat(day_dfs, ignore_index=True)
+            evts = extract_events(df_year, meta["lat"], meta["lon"],
+                                  code, meta["name"], f"surfrad_{year}")
+            print(f"  {year}: {len(day_dfs)} days, {len(df_year):,} records → {len(evts)} events (downloaded {missing} new files)")
+            all_events.extend(evts)
+
+    out = RAW_SIGHTINGS / "surfrad_twilight.csv"
+    if all_events:
+        df = pd.DataFrame(all_events).sort_values(
+            ["station_code", "date", "event_type", "threshold_wm2"])
+        df.to_csv(out, index=False)
+        print(f"\nSaved {len(df):,} rows → {out}")
+
+        prim = df[df["threshold_wm2"] == 1.0]
+        print(f"\nSummary at 1 W/m²: {len(prim):,} events")
+        for etype in ["dawn", "dusk"]:
+            sub = prim[prim["event_type"] == etype]
+            if not sub.empty:
+                d = sub["solar_depression_deg"]
+                print(f"  {etype}: {len(sub):,} | mean={d.mean():.2f}° "
+                      f"p10={d.quantile(0.1):.2f}° p90={d.quantile(0.9):.2f}°")
+
+        # Update combined file
+        combined = RAW_SIGHTINGS / "bsrn_all_twilight.csv"
+        if combined.exists():
+            df_all = pd.read_csv(combined)
+            df_all = df_all[~df_all["source"].str.startswith("surfrad", na=False)]
+            df_all = pd.concat([df_all, pd.DataFrame(all_events)], ignore_index=True)
+            df_all = df_all.sort_values(["station_code", "date", "event_type", "threshold_wm2"])
+            df_all.to_csv(combined, index=False)
+            print(f"Updated bsrn_all_twilight.csv → {len(df_all):,} total rows")
+    else:
+        print("No events.")
+
+    print("\nDone.")
+
+
+if __name__ == "__main__":
+    main()