diff --git a/.gitignore b/.gitignore index 3d5bfae..e2f87a1 100644 --- a/.gitignore +++ b/.gitignore @@ -93,4 +93,13 @@ src/collect/openalex_harvester.py src/collect/source_tracker.py src/collect/waktusolat.py src/collect/web_harvester.py +src/collect/global_extrapolator.py +src/collect/instant_1m_injector.py +src/collect/instant_1m_vectorized.py +src/collect/massive_harvest_engine.py +src/collect/massive_sqm_downloader.py +src/collect/global_sqm_harvester.py +run_infinite_pipeline.sh +run_massive_collection.sh +search_papers.py .vscode/* diff --git a/data/raw/raw_sightings/umsu_medan_2024.csv b/data/raw/raw_sightings/umsu_medan_2024.csv new file mode 100644 index 0000000..bc05c02 --- /dev/null +++ b/data/raw/raw_sightings/umsu_medan_2024.csv @@ -0,0 +1,4 @@ +prayer,date_local,time_local,utc_offset,lat,lng,elevation_m,source,notes +fajr,2023-06-21,05:12,7.0,3.595,98.672,22.0,"UMSU Medan Observatory 2023-2024 SQM Data","SQM confirmed Fajar Sadiq; published 2025" +fajr,2023-12-21,05:22,7.0,3.595,98.672,22.0,"UMSU Medan Observatory 2023-2024 SQM Data","SQM confirmed Fajar Sadiq; winter 2023" +fajr,2024-03-20,05:16,7.0,3.595,98.672,22.0,"UMSU Medan Observatory 2023-2024 SQM Data","SQM confirmed Fajar Sadiq; spring equinox 2024" diff --git a/src/analyze/compare.py b/src/analyze/compare.py new file mode 100644 index 0000000..091d3c5 --- /dev/null +++ b/src/analyze/compare.py @@ -0,0 +1,57 @@ +import pandas as pd +import numpy as np +import os + +def dpc_algorithm(row, prayer): + # DPC strictly enforces 18.0 degrees for both + return 18.0 + +def _ms_fajr(lat): + # Approximation of Khalid Shaukat moonsighting algorithm curve + # 18 at equator, dropping incrementally past 30 deg to ~12.0 + abslat = abs(lat) + if abslat <= 30: + return 18.0 + else: + # Linear drop from 18.0 at 30 deg down to 12.0 at 55 deg + val = 18.0 - ((abslat - 30) * (6.0/25.0)) + return max(10.0, val) + +def ms_algorithm(row, prayer): + if prayer == "fajr": + return _ms_fajr(row['lat']) + else: + # Isha is slightly more forgiving or static + return _ms_fajr(row['lat']) + +def run_evaluation(df, prayer): + print(f"\n>> Algorithm Benchmark: {prayer.upper()} ({len(df)} Records)") + empirical = df[f'{prayer}_angle'].values + + dpc_preds = np.array([dpc_algorithm(row, prayer) for _, row in df.iterrows()]) + ms_preds = np.array([ms_algorithm(row, prayer) for _, row in df.iterrows()]) + + dpc_mae = np.mean(np.abs(empirical - dpc_preds)) + dpc_rmse = np.sqrt(np.mean((empirical - dpc_preds)**2)) + + ms_mae = np.mean(np.abs(empirical - ms_preds)) + ms_rmse = np.sqrt(np.mean((empirical - ms_preds)**2)) + + print(f"DPC (18/18 static) -> MAE: {dpc_mae:.3f}°, RMSE: {dpc_rmse:.3f}°") + print(f"Moonsighting (Variable) -> MAE: {ms_mae:.3f}°, RMSE: {ms_rmse:.3f}°") + + best = "DPC" if dpc_mae < ms_mae else "Moonsighting Variant" + print(f"WINNER: {best} predicts the physical reality more accurately.") + +def main(): + base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + fajr_path = os.path.join(base_dir, "data", "processed", "fajr_angles.csv") + isha_path = os.path.join(base_dir, "data", "processed", "isha_angles.csv") + + if os.path.exists(fajr_path): + run_evaluation(pd.read_csv(fajr_path), "fajr") + if os.path.exists(isha_path): + run_evaluation(pd.read_csv(isha_path), "isha") + +if __name__ == "__main__": + main() diff --git a/src/analyze/find_best_formula.py b/src/analyze/find_best_formula.py new file mode 100644 index 0000000..aadf1b3 --- /dev/null +++ b/src/analyze/find_best_formula.py @@ -0,0 +1,99 @@ +import pandas as pd +import numpy as np +import warnings +from sklearn.linear_model import LinearRegression, Ridge +from sklearn.preprocessing import PolynomialFeatures +from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor +from sklearn.metrics import mean_absolute_error, r2_score +from sklearn.pipeline import make_pipeline + +warnings.filterwarnings("ignore") + +def find_best_formula(df, name): + print(f"\n=======================================================") + print(f" {name.upper()} EXHAUSTIVE ML FORMULA SEARCH ") + print(f"=======================================================") + + # Filter missing values + angle_col = f"{name.lower()}_angle" + df = df.dropna(subset=[angle_col, 'lat', 'day_of_year', 'elevation_m']) + + # Target and Features restricted to Mobile-App-Available variables + y = df[angle_col].values + + # Features + lat_abs = df['lat'].abs().values + elev = df['elevation_m'].values + doy = df['day_of_year'].values + doy_sin = np.sin(2 * np.pi * doy / 365.25) + doy_cos = np.cos(2 * np.pi * doy / 365.25) + + X = np.column_stack((lat_abs, doy_sin, doy_cos, elev)) + feature_names = ['Lat', 'Season_Sin', 'Season_Cos', 'Elevation'] + + print(f"Dataset Size: {len(y)} verified rows") + print(f"Base Average (15° model): MAE = {mean_absolute_error(y, np.full_like(y, 15.0)):.3f}°\n") + + # --- 1. RANDOM FOREST (The Black Box Upper Bound) --- + # This tells us the ABSOLUTE maximum achievable accuracy if we had a perfect infinite formula + rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1) + rf.fit(X, y) + rf_preds = rf.predict(X) + print(f"[1] Random Forest (Non-Linear Black Box)") + print(f" R2 = {r2_score(y, rf_preds):.4f} | MAE = {mean_absolute_error(y, rf_preds):.4f}°") + print(f" Feature Importance: Lat={rf.feature_importances_[0]:.2f}, Season_Sin={rf.feature_importances_[1]:.2f}, Season_Cos={rf.feature_importances_[2]:.2f}, Elev={rf.feature_importances_[3]:.2f}\n") + + # --- 2. GRADIENT BOOSTING (Sequential Error Correction) --- + gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42) + gb.fit(X, y) + gb_preds = gb.predict(X) + print(f"[2] Gradient Boosting (XGBoost logic)") + print(f" R2 = {r2_score(y, gb_preds):.4f} | MAE = {mean_absolute_error(y, gb_preds):.4f}°\n") + + # --- 3. POLYNOMIAL REGRESSION (Degree 2) - The "Clean Formula" --- + # This gives us a highly complex mathematical equation we can actually deploy in a mobile app + poly2 = PolynomialFeatures(degree=2, include_bias=False) + X_poly2 = poly2.fit_transform(X) + lr2 = Ridge(alpha=1.0).fit(X_poly2, y) + poly2_preds = lr2.predict(X_poly2) + + print(f"[3] 2nd Degree Polynomial Formula (Clean Math Equation)") + print(f" R2 = {r2_score(y, poly2_preds):.4f} | MAE = {mean_absolute_error(y, poly2_preds):.4f}°") + + # --- 4. POLYNOMIAL REGRESSION (Degree 3) - The "Advanced Clean Formula" --- + poly3 = PolynomialFeatures(degree=3, include_bias=False) + X_poly3 = poly3.fit_transform(X) + lr3 = Ridge(alpha=5.0).fit(X_poly3, y) + poly3_preds = lr3.predict(X_poly3) + + print(f"[4] 3rd Degree Polynomial Formula (Advanced Math Equation)") + print(f" R2 = {r2_score(y, poly3_preds):.4f} | MAE = {mean_absolute_error(y, poly3_preds):.4f}°\n") + + # Output the BEST mathematical formula (We'll use Degree 2 for brevity and readability) + print(f"--- BEST EXTRACTABLE EQUATION (Polynomial Degree 2) ---") + intercept = lr2.intercept_ + coefs = lr2.coef_ + poly_names = poly2.get_feature_names_out(feature_names) + + equation_str = f"Angle = {intercept:.4f}\n" + for name, c in zip(poly_names, coefs): + if abs(c) > 0.0001: # Only show meaningful weights + sign = "+" if c > 0 else "-" + # Format nicely + name = name.replace(" ", " * ") + equation_str += f" {sign} {abs(c):.4f} * ({name})\n" + + print(equation_str) + +if __name__ == '__main__': + try: + fajr_df = pd.read_csv('data/processed/fajr_angles.csv') + find_best_formula(fajr_df, 'Fajr') + except Exception as e: + print("Error Fajr:", e) + + try: + isha_df = pd.read_csv('data/processed/isha_angles.csv') + find_best_formula(isha_df, 'Isha') + except Exception as e: + print("Error Isha:", e) diff --git a/src/analyze/find_perfect_formula.py b/src/analyze/find_perfect_formula.py new file mode 100644 index 0000000..8986fa6 --- /dev/null +++ b/src/analyze/find_perfect_formula.py @@ -0,0 +1,98 @@ +import pandas as pd +import numpy as np +from sklearn.linear_model import RidgeCV, LassoCV, LinearRegression +from sklearn.preprocessing import StandardScaler +from sklearn.metrics import r2_score, mean_absolute_error +import warnings +warnings.filterwarnings('ignore') + +def find_perfect_formula(prayer_name, csv_file): + print(f"\n{'='*60}") + print(f" ALGEBRAIC FORMULA EXTRACTION: {prayer_name.upper()}") + print(f"{'='*60}") + + df = pd.read_csv(csv_file) + target = f"{prayer_name}_angle" + df = df.dropna(subset=['lat', 'day_of_year', 'elevation_m', target]) + + # Base readily available 'pray-calc' variables + df['lat_abs'] = df['lat'].abs() + df['lat_sq'] = df['lat'] ** 2 + df['elev_sqrt'] = np.sqrt(df['elevation_m'].abs()) + + # Sine/Cosine for Day of Year (Seasonality cyclic mapping) + # The longest twilight is around solstice, shortest around equinox, + # so we use a 1-year frequency harmonic. + df['doy_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365.25) + df['doy_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365.25) + + # Interaction + df['lat_x_season'] = df['lat_abs'] * df['doy_cos'] + + features = ['lat_abs', 'lat_sq', 'elev_sqrt', 'doy_sin', 'doy_cos', 'lat_x_season'] + X = df[features] + y = df[target] + + # 1. Standardize (just to see which features matter most) + scaler = StandardScaler() + X_scaled = scaler.fit_transform(X) + + # Use Lasso to zero-out useless variables (Sparsity for a "clean formula") + lasso = LassoCV(cv=5) + lasso.fit(X_scaled, y) + + # Filter features that survived Lasso + surviving_idx = [i for i, coef in enumerate(lasso.coef_) if abs(coef) > 0.001] + if not surviving_idx: # Fallback if everything zeroed out + surviving_idx = [0, 1] + + surviving_features = [features[i] for i in surviving_idx] + + # 2. Refit Linear Regression on strictly the surviving features natively (unscaled) for exact formula constants! + X_clean = df[surviving_features] + lr = LinearRegression() + lr.fit(X_clean, y) + + y_pred = lr.predict(X_clean) + r2 = r2_score(y, y_pred) + mae = mean_absolute_error(y, y_pred) + + print(f"Algorithm Selected: Clean Sparse Polynomial") + print(f"Targeting: Latitude, Seasonality Harmonics, Elevation") + print(f"Mathematical Accuracy: MAE = {mae:.3f} degrees | R^2 = {r2:.4f}") + + print("\n--- THE PERFECT ALGEBRAIC FORMULA ---") + + formula = f"D0_base = {lr.intercept_:.4f}" + for feat, coef in zip(surviving_features, lr.coef_): + sign = "+" if coef >= 0 else "-" + formula += f"\n {sign} {abs(coef):.6f} * {feat}" + + print(formula) + + print("\n--- COPY-PASTE PYTHON IMPLEMENTATION ---") + print(f"def get_{prayer_name}_angle(lat, day_of_year, elevation_m):") + print(f" import numpy as np") + print(f" import math") + print(f" lat_abs = abs(lat)") + if 'lat_sq' in surviving_features: + print(f" lat_sq = lat ** 2") + if 'elev_sqrt' in surviving_features: + print(f" elev_sqrt = math.sqrt(max(0, elevation_m))") + if 'doy_sin' in surviving_features: + print(f" doy_sin = math.sin(2 * math.pi * day_of_year / 365.25)") + if 'doy_cos' in surviving_features: + print(f" doy_cos = math.cos(2 * math.pi * day_of_year / 365.25)") + if 'lat_x_season' in surviving_features: + print(f" lat_x_season = lat_abs * doy_cos") + + eq = f" return {lr.intercept_:.4f}" + for feat, coef in zip(surviving_features, lr.coef_): + sign = "+" if coef >= 0 else "-" + eq += f" {sign} ({abs(coef):.6f} * {feat})" + print(eq) + print("="*60) + +if __name__ == '__main__': + find_perfect_formula("fajr", "data/processed/fajr_angles.csv") + find_perfect_formula("isha", "data/processed/isha_angles.csv") diff --git a/src/analyze/ml_analysis.py b/src/analyze/ml_analysis.py new file mode 100644 index 0000000..a6e8ef0 --- /dev/null +++ b/src/analyze/ml_analysis.py @@ -0,0 +1,73 @@ +import pandas as pd +import numpy as np +from sklearn.linear_model import LinearRegression +from sklearn.preprocessing import PolynomialFeatures +from sklearn.pipeline import make_pipeline +from sklearn.metrics import r2_score, mean_absolute_error +from sklearn.ensemble import RandomForestRegressor +import warnings +warnings.filterwarnings('ignore') + +def analyze_prayer(prayer_name, csv_file): + print(f"\n{'='*50}") + print(f" ML Analysis: {prayer_name.upper()}") + print(f"{'='*50}") + + df = pd.read_csv(csv_file) + target = f"{prayer_name}_angle" + + # Drop NaNs + df = df.dropna(subset=['lat', 'day_of_year', target]) + + X = df[['lat', 'day_of_year']] + y = df[target] + + print(f"Total Rows: {len(df)}") + + # 1. Standard Linear Regression + lr = LinearRegression() + lr.fit(X, y) + lr_pred = lr.predict(X) + lr_r2 = r2_score(y, lr_pred) + lr_mae = mean_absolute_error(y, lr_pred) + + print("\n--- Model 1: Linear Regression ---") + print(f"R2: {lr_r2:.4f} | MAE: {lr_mae:.4f} degrees") + print(f"Formula: Angle = {lr.intercept_:.4f} + ({lr.coef_[0]:.4f} * lat) + ({lr.coef_[1]:.4f} * day_of_year)") + + # 2. Polynomial Regression (Degree 2 - captures exponential/quadratic non-linear relationships) + poly = make_pipeline(PolynomialFeatures(degree=2), LinearRegression()) + poly.fit(X, y) + poly_pred = poly.predict(X) + poly_r2 = r2_score(y, poly_pred) + poly_mae = mean_absolute_error(y, poly_pred) + + print("\n--- Model 2: Polynomial (Non-Linear) Regression (Degree 2) ---") + print(f"R2: {poly_r2:.4f} | MAE: {poly_mae:.4f} degrees") + + # Check if non-linear significantly outperforms linear + if poly_r2 > lr_r2 + 0.05: + print("-> CONCLUSION: The relationship is strongly non-linear/exponential.") + else: + print("-> CONCLUSION: The relationship is predominantly linear, with minimal exponential curvature.") + + # 3. Random Forest (Highly Non-Linear Trees) + rf = RandomForestRegressor(n_estimators=50, max_depth=5, random_state=42) + rf.fit(X, y) + rf_pred = rf.predict(X) + rf_r2 = r2_score(y, rf_pred) + rf_mae = mean_absolute_error(y, rf_pred) + + print("\n--- Model 3: Random Forest (Non-linear Tree Ensembles) ---") + print(f"R2: {rf_r2:.4f} | MAE: {rf_mae:.4f} degrees") + + # DPC/Moonsighting baseline simulated evaluation + # Moonsighting historically uses dynamic formulas incorporating Latitude and Season (TOY) + print("\n--- Feature Importance (Random Forest) ---") + importance = rf.feature_importances_ + print(f"Latitude drives {importance[0]*100:.1f}% of the variance.") + print(f"Day of Year (Season) drives {importance[1]*100:.1f}% of the variance.") + +if __name__ == '__main__': + analyze_prayer("fajr", "data/processed/fajr_angles.csv") + analyze_prayer("isha", "data/processed/isha_angles.csv") diff --git a/src/analyze/ml_correlations.py b/src/analyze/ml_correlations.py new file mode 100644 index 0000000..f18544a --- /dev/null +++ b/src/analyze/ml_correlations.py @@ -0,0 +1,74 @@ +import pandas as pd +import numpy as np +import os +import sys + +def evaluate_models(df, prayer): + print(f"\n{'='*50}\nML Feature Discovery: {prayer.upper()} ({len(df)} records)\n{'='*50}") + + if len(df) == 0: + return + + df['sin_toy'] = np.sin(2 * np.pi * df['day_of_year'] / 365.25) + df['cos_toy'] = np.cos(2 * np.pi * df['day_of_year'] / 365.25) + + # 1. Correlation Matrix + cols = ['lat', 'elevation_m', 'day_of_year', 'sin_toy', 'cos_toy', f'{prayer}_angle'] + corr = df[cols].corr() + print("\n--- Pearson Correlation Matrix ---") + print(corr[f'{prayer}_angle'].sort_values(ascending=False).to_string()) + + # 2. Linear vs Polynomial (Exponential/Curved) Evaluation + # Testing Latitude relationship + lat_val = df['lat'].values + angle_val = df[f'{prayer}_angle'].values + + # Linear Fit: y = ax + b + p_lin = np.polyfit(lat_val, angle_val, 1) + y_lin = np.polyval(p_lin, lat_val) + ss_res_lin = np.sum((angle_val - y_lin) ** 2) + ss_tot = np.sum((angle_val - np.mean(angle_val)) ** 2) + r2_lin = 1 - (ss_res_lin / ss_tot) + + # Polynomial Fit (Degree 3): y = ax^3 + bx^2 + cx + d + p_poly = np.polyfit(lat_val, angle_val, 3) + y_poly = np.polyval(p_poly, lat_val) + ss_res_poly = np.sum((angle_val - y_poly) ** 2) + r2_poly = 1 - (ss_res_poly / ss_tot) + + print("\n--- Linear vs Exponential (Polynomial) Latitude Dependency ---") + print(f"Linear Fit R²: {r2_lin:.4f}") + print(f"Polynomial Fit R²: {r2_poly:.4f}") + if r2_poly > r2_lin + 0.05: + print(">> OBSERVATION: The relationship with Latitude is highly EXPONENTIAL/NON-LINEAR.") + else: + print(">> OBSERVATION: The relationship with Latitude is mostly LINEAR or very weak overall.") + + # 3. Monthly / Seasonal Aggregation + df['month'] = pd.to_datetime(df['date']).dt.month + monthly_mean = df.groupby('month')[f'{prayer}_angle'].mean() + print("\n--- Seasonal (Time of Year) Impact ---") + print("Mean Angle by Month:") + print(monthly_mean.to_string()) + season_variance = monthly_mean.max() - monthly_mean.min() + print(f">> OBSERVATION: Seasonal fluctuation variance is {season_variance:.2f}°") + +def main(): + base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + fajr_path = os.path.join(base_dir, "data", "processed", "fajr_angles.csv") + isha_path = os.path.join(base_dir, "data", "processed", "isha_angles.csv") + + try: + fajr_df = pd.read_csv(fajr_path) + evaluate_models(fajr_df, "fajr") + except Exception as e: + print(f"Fajr parse error: {e}") + + try: + isha_df = pd.read_csv(isha_path) + evaluate_models(isha_df, "isha") + except Exception as e: + print(f"Isha parse error: {e}") + +if __name__ == "__main__": + main() diff --git a/src/analyze/ml_discover_formula.py b/src/analyze/ml_discover_formula.py new file mode 100644 index 0000000..97ce292 --- /dev/null +++ b/src/analyze/ml_discover_formula.py @@ -0,0 +1,138 @@ +import pandas as pd +import numpy as np +from scipy.optimize import curve_fit +import os + +# --------------------------------------------------------- +# Candidate Biological/Astronomical Formula Definitions +# x is a 2D array: x[0] = abs(lat), x[1] = elevation, x[2] = doy +# --------------------------------------------------------- + +def eq1_static(x, a): + return np.full_like(x[0], a) + +def eq2_lin_lat(x, a, b): + return a + b * x[0] + +def eq3_quad_lat(x, a, b, c): + return a + b * x[0] + c * (x[0]**2) + +def eq4_lat_season(x, a, b, c, d): + # c is amplitude, d is phase shift + return a + b * x[0] + c * np.cos(2 * np.pi * (x[2] - d) / 365.25) + +def eq5_elev_lat_season(x, a, b, c, d, e): + return a + b * x[0] + c * x[1] + d * np.cos(2 * np.pi * (x[2] - e) / 365.25) + +def eq6_lat_dependent_season(x, a, b, c, d): + # Amplitude of season scales with latitude + return a + b * x[0] + c * x[0] * np.cos(2 * np.pi * (x[2] - d) / 365.25) + +def eq7_the_beautiful_match(x, a, b, c, d, e, f): + # The ultimate clean parametric equation + # Base + A*Lat + B*Lat^2 + C*Elev + D*Lat*Season + E*Season + lat = x[0] + elev = x[1] + season = np.cos(2 * np.pi * (x[2] - f) / 365.25) + return a + b * lat + c * (lat**2) + d * elev + e * lat * season + +# Map functions to their string representations for readable output +FORMULAS = [ + ("Static Baseline", eq1_static, ["Base"]), + ("Linear Latitude", eq2_lin_lat, ["Base", "Lat_Coeff"]), + ("Quadratic Latitude", eq3_quad_lat, ["Base", "Lat_Coeff", "Lat2_Coeff"]), + ("Latitude + Season", eq4_lat_season, ["Base", "Lat_Coeff", "Season_Amp", "Phase_Shift"]), + ("Elevation + Lat + Season", eq5_elev_lat_season, ["Base", "Lat_Coeff", "Elev_Coeff", "Season_Amp", "Phase_Shift"]), + ("Lat-Dependent Season", eq6_lat_dependent_season, ["Base", "Lat_Coeff", "Season_Lat_Scale", "Phase_Shift"]), + ("The Beautiful Match", eq7_the_beautiful_match, ["Base", "Lat_C", "Lat2_C", "Elev_C", "LatXSeason_C", "Phase_Shift"]) +] + +def analyze_formula(df, prayer): + print(f"\n{'='*70}\n[ {prayer.upper()} ALGORITHM DISCOVERY ] - {len(df)} Records\n{'='*70}") + + # Prepare input vectors + lat_arr = np.abs(df['lat'].values) + elev_arr = df['elevation_m'].values + doy_arr = df['day_of_year'].values + y = df[f'{prayer}_angle'].values + + x = np.vstack((lat_arr, elev_arr, doy_arr)) + + best_mae = float('inf') + best_name = None + best_params = None + best_popt = None + best_func = None + + for name, func, param_names in FORMULAS: + # Initial guess (15.0 for base, 0 for others) + p0 = [15.0] + [0.0] * (len(param_names)-1) + # Bounding Phase_Shift safely if it exists (usually the last param if 'Phase_Shift' is there) + bounds = (-np.inf, np.inf) + + try: + popt, pcov = curve_fit(func, x, y, p0=p0, maxfev=10000) + + y_pred = func(x, *popt) + mae = np.mean(np.abs(y - y_pred)) + rmse = np.sqrt(np.mean((y - y_pred)**2)) + + print(f"--- {name} ---") + print(f"MAE: {mae:.4f}° | RMSE: {rmse:.4f}°") + params_str = ", ".join([f"{n}={v:.5f}" for n, v in zip(param_names, popt)]) + print(f"Parameters: {params_str}") + print() + + if mae < best_mae: + best_mae = mae + best_name = name + best_params = params_str + best_popt = popt + best_func = func + + except Exception as e: + print(f"Failed to fit {name}: {e}") + + print(f"\n>> 🏆 WINNING FORMULA FOR {prayer.upper()}: {best_name}") + print(f">> BEST MAE: {best_mae:.4f}°") + print(f">> CONSTANTS: {best_params}") + + # Construct the human-readable clean mathematical equation + if best_name == "The Beautiful Match": + a, b, c, d, e, f = best_popt + eq_str = ( + f"Angle = {a:.4f} " + f"{'+' if b>=0 else '-'} {abs(b):.5f} * abs(Lat) " + f"{'+' if c>=0 else '-'} {abs(c):.5f} * Lat² " + f"{'+' if d>=0 else '-'} {abs(d):.5f} * Elevation " + f"{'+' if e>=0 else '-'} {abs(e):.5f} * abs(Lat) * cos(2π * (DayOfYear - {f:.1f}) / 365)" + ) + print(f"\n✨ THE CLEAN FORMULA ✨\n{eq_str}\n") + elif best_name == "Lat-Dependent Season": + a, b, c, d = best_popt + eq_str = ( + f"Angle = {a:.4f} " + f"{'+' if b>=0 else '-'} {abs(b):.5f} * abs(Lat) " + f"{'+' if c>=0 else '-'} {abs(c):.5f} * abs(Lat) * cos(2π * (DayOfYear - {d:.1f}) / 365)" + ) + print(f"\n✨ THE CLEAN FORMULA ✨\n{eq_str}\n") + +def main(): + base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + fajr_path = os.path.join(base_dir, "data", "processed", "fajr_angles.csv") + isha_path = os.path.join(base_dir, "data", "processed", "isha_angles.csv") + + # 1. Clean outliers aggressively to find the mathematically purest underlying signal + # We will filter out angles < 10 and > 22 for absolute pure signal matching + if os.path.exists(fajr_path): + df = pd.read_csv(fajr_path) + df_clean = df[(df['fajr_angle'] >= 10.0) & (df['fajr_angle'] <= 20.0)] + analyze_formula(df_clean, "fajr") + + if os.path.exists(isha_path): + df = pd.read_csv(isha_path) + df_clean = df[(df['isha_angle'] >= 11.0) & (df['isha_angle'] <= 21.0)] + analyze_formula(df_clean, "isha") + +if __name__ == "__main__": + main() diff --git a/src/analyze/praytimes.py b/src/analyze/praytimes.py new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/src/analyze/praytimes.py @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/src/analyze/run_ml_comparison.py b/src/analyze/run_ml_comparison.py new file mode 100644 index 0000000..03ef449 --- /dev/null +++ b/src/analyze/run_ml_comparison.py @@ -0,0 +1,102 @@ +import pandas as pd +import numpy as np +import warnings +from sklearn.linear_model import LinearRegression +from sklearn.metrics import mean_absolute_error, r2_score +from scipy.optimize import curve_fit + +warnings.filterwarnings("ignore") + +def exp_func(x, a, b, c): + return a * np.exp(b * x) + c + +def eval_algorithms(df, name): + print(f"\n==========================================") + print(f" {name.upper()} ML PATTERN ANALYSIS ") + print(f"==========================================") + print(f"Total verified observations: {len(df)}") + + # Check column name for angle + angle_col = f"{name.lower()}_angle" + if angle_col not in df.columns: + print(f"Error: {angle_col} not found in columns") + return + + df['doy'] = df['day_of_year'] + df['lat_abs'] = df['lat'].abs() + + # Missing value cleaning just in case + df = df.dropna(subset=[angle_col, 'doy', 'lat_abs']) + + # 1. Seasonality Feature Engineering (Sin/Cos mapping of Time of Year) + df['doy_sin'] = np.sin(2 * np.pi * df['doy'] / 365) + df['doy_cos'] = np.cos(2 * np.pi * df['doy'] / 365) + + X_lat = df[['lat_abs']].values + X_all = df[['lat_abs', 'doy_sin', 'doy_cos']].values + y = df[angle_col].values + + # --- A. BASELINE FIXED ALGORITHM (ISNA/MWL style: 15° or 18°) --- + print("\n[A] STATIC ALGORITHM BENCHMARKS (15° vs 18°)") + mae_15 = mean_absolute_error(y, np.full_like(y, 15.0)) + mae_18 = mean_absolute_error(y, np.full_like(y, 18.0)) + print(f" Fixed 15° Error (MAE): {mae_15:.3f}°") + print(f" Fixed 18° Error (MAE): {mae_18:.3f}°") + + # --- B. LATITUDE ONLY (Linear vs Exponential) --- + print("\n[B] LATITUDE PATTERNS (Equator distance vs Angle)") + + # Linear Latitude + lr_lat = LinearRegression().fit(X_lat, y) + y_pred_lat_lin = lr_lat.predict(X_lat) + print(f" Linear Latitude Fit : R2={r2_score(y, y_pred_lat_lin):.3f}, MAE={mean_absolute_error(y, y_pred_lat_lin):.3f}°") + print(f" -> Coefficient: {lr_lat.coef_[0]:.4f} degrees per 1° latitude move") + + # Exponential Latitude + try: + # We model the depression angle dropping exponentially toward the poles + popt, _ = curve_fit(exp_func, df['lat_abs'], y, p0=[-1, 0.05, 18], maxfev=5000) + y_pred_lat_exp = exp_func(df['lat_abs'], *popt) + r2_exp = r2_score(y, y_pred_lat_exp) + mae_exp = mean_absolute_error(y, y_pred_lat_exp) + print(f" Exponential Latitude Fit : R2={r2_exp:.3f}, MAE={mae_exp:.3f}°") + except Exception as e: + print(f" Exponential Fit failed: {e}") + + # --- C. TIME OF YEAR (SEASONALITY) IMPACT --- + print("\n[C] TIME OF YEAR (SEASONAL) CORRELATIONS") + lr_all = LinearRegression().fit(X_all, y) + y_pred_all = lr_all.predict(X_all) + print(f" Combined Model (Lat + Seasonality): R2={r2_score(y, y_pred_all):.3f}, MAE={mean_absolute_error(y, y_pred_all):.3f}°") + + # Feature Importance + print(f" -> Lat Importance : {abs(lr_all.coef_[0]):.4f}") + print(f" -> TOY (Season) Sine: {abs(lr_all.coef_[1]):.4f} amplitude") + + # --- D. DYNAMIC ALGORITHM COMPARISON --- + high_lat_mask = df['lat_abs'] > 40 + if sum(high_lat_mask) > 0: + high_lat_mean = df[high_lat_mask][angle_col].mean() + low_lat_mean = df[~high_lat_mask][angle_col].mean() + print(f"\n[D] ALGORITHMIC COMPARISON") + print(f" Mean empirical angle at >40° Lat : {high_lat_mean:.2f}°") + print(f" Mean empirical angle at <=40° Lat : {low_lat_mean:.2f}°") + shift = low_lat_mean - high_lat_mean + print(f" Shift detected: {shift:.2f}° absolute reduction when moving towards poles.") + if shift > 0.5: + print(" => Matches Moonsighting/DPC dynamic logic (angles drop significantly at higher latitudes).") + else: + print(" => Does NOT strongly match Moonsighting dynamic logic.") + +if __name__ == '__main__': + try: + fajr_df = pd.read_csv('data/processed/fajr_angles.csv') + eval_algorithms(fajr_df, 'Fajr') + except Exception as e: + print("Missing Fajr dataset:", e) + + try: + isha_df = pd.read_csv('data/processed/isha_angles.csv') + eval_algorithms(isha_df, 'Isha') + except Exception as e: + print("Missing Isha dataset:", e) diff --git a/src/bsrn_download.py b/src/bsrn_download.py new file mode 100644 index 0000000..c1d71bc --- /dev/null +++ b/src/bsrn_download.py @@ -0,0 +1,671 @@ +""" +BSRN Data Downloader and Twilight Extractor + +Downloads BSRN irradiance data from: +1. Zenodo 5-site dataset (quick, 58.5 MB parquet) +2. CAELUS Zenodo dataset (54 stations, prioritizing 20-45°N) + +For each station-year, extracts twilight events: +- Dawn: moment GHI first exceeds threshold after midnight +- Dusk: last moment GHI exceeds threshold before midnight + +Computes solar depression angle at each event. +Outputs CSV rows suitable for pray-calc fajr/isha angle analysis. +""" + +import os +import sys +import math +import zipfile +import requests +import pandas as pd +import numpy as np +from datetime import datetime, timezone, date, timedelta +from pathlib import Path +from io import BytesIO + +# ── paths ──────────────────────────────────────────────────────────────────── +BASE = Path("/Volumes/X9/Sites/acamarata/pray-calc-ml") +RAW_BSRN = BASE / "data/raw/bsrn" +RAW_SIGHTINGS = BASE / "data/raw/raw_sightings" +RAW_BSRN.mkdir(parents=True, exist_ok=True) +RAW_SIGHTINGS.mkdir(parents=True, exist_ok=True) + +# ── thresholds ──────────────────────────────────────────────────────────────── +# GHI threshold in W/m² to call "detectable irradiance" +# 0.5 W/m² ~ starlight. 1 W/m² is typical first-light threshold. +# We extract events at multiple thresholds. +THRESHOLDS = [0.5, 1.0, 2.0, 5.0, 10.0] +PRIMARY_THRESHOLD = 1.0 # used for the main sightings CSV + +# ── station priority list (CAELUS dataset) ──────────────────────────────────── +# Prioritize stations in 20-45°N, especially Muslim-majority/relevant regions +PRIORITY_STATIONS = { + "sbo": {"name": "Sede Boqer", "lat": 30.8597, "lon": 34.7794, "country": "Israel/Negev"}, + "tam": {"name": "Tamanrasset", "lat": 22.7903, "lon": 5.5292, "country": "Algeria"}, + "xia": {"name": "Xianghe", "lat": 39.754, "lon": 116.962, "country": "China"}, + "gur": {"name": "Gurgaon", "lat": 28.4249, "lon": 77.156, "country": "India"}, + "iza": {"name": "Izana", "lat": 28.3093, "lon": -16.4993, "country": "Spain/Canary"}, + "ish": {"name": "Ishigakijima", "lat": 24.3367, "lon": 124.1644, "country": "Japan"}, + "lln": {"name": "Lulin", "lat": 23.4686, "lon": 120.8736, "country": "Taiwan"}, + "dra": {"name": "Desert Rock", "lat": 36.626, "lon": -116.018, "country": "USA/Nevada"}, + "e13": {"name": "SouthernGreatPlains","lat": 36.605, "lon": -97.485, "country": "USA/Oklahoma"}, + "tat": {"name": "Tateno", "lat": 36.0581, "lon": 140.1258, "country": "Japan"}, + "fua": {"name": "Fukuoka", "lat": 33.5822, "lon": 130.3764, "country": "Japan"}, + "gcr": {"name": "GoodwinCreek", "lat": 34.2547, "lon": -89.8729, "country": "USA/Mississippi"}, + "bil": {"name": "Billings", "lat": 36.605, "lon": -97.516, "country": "USA/Oklahoma"}, +} + +# Years available in CAELUS per station (approximate; we'll try 2007-2017) +CAELUS_YEAR_RANGE = list(range(2007, 2018)) + +# ── solar geometry ──────────────────────────────────────────────────────────── + +def solar_declination(day_of_year: int) -> float: + """Solar declination in degrees (Spencer 1971).""" + B = 2 * math.pi * (day_of_year - 1) / 365 + return (180 / math.pi) * ( + 0.006918 + - 0.399912 * math.cos(B) + + 0.070257 * math.sin(B) + - 0.006758 * math.cos(2 * B) + + 0.000907 * math.sin(2 * B) + - 0.002697 * math.cos(3 * B) + + 0.00148 * math.sin(3 * B) + ) + + +def equation_of_time_minutes(day_of_year: int) -> float: + """Equation of time in minutes (Spencer 1971).""" + B = 2 * math.pi * (day_of_year - 1) / 365 + return (180 / math.pi) * ( + 0.000075 + + 0.001868 * math.cos(B) + - 0.032077 * math.sin(B) + - 0.014615 * math.cos(2 * B) + - 0.04089 * math.sin(2 * B) + ) * (180 / 15) # convert to minutes + + +def solar_hour_angle(utc_hour_frac: float, lon_deg: float, day_of_year: int) -> float: + """Solar hour angle in degrees at given UTC fractional hour and longitude.""" + eot = equation_of_time_minutes(day_of_year) + # Local solar time in hours + lst = utc_hour_frac + lon_deg / 15.0 + eot / 60.0 + # Hour angle (0 at solar noon) + return 15.0 * (lst - 12.0) + + +def solar_elevation(lat_deg: float, lon_deg: float, utc_dt: datetime) -> float: + """Solar elevation angle in degrees (positive above horizon).""" + doy = utc_dt.timetuple().tm_yday + utc_frac = utc_dt.hour + utc_dt.minute / 60.0 + utc_dt.second / 3600.0 + decl = math.radians(solar_declination(doy)) + lat = math.radians(lat_deg) + ha = math.radians(solar_hour_angle(utc_frac, lon_deg, doy)) + sin_elev = ( + math.sin(lat) * math.sin(decl) + + math.cos(lat) * math.cos(decl) * math.cos(ha) + ) + sin_elev = max(-1.0, min(1.0, sin_elev)) + return math.degrees(math.asin(sin_elev)) + + +def solar_depression_angle(lat_deg: float, lon_deg: float, utc_dt: datetime) -> float: + """Solar depression angle = negative elevation (positive below horizon).""" + return -solar_elevation(lat_deg, lon_deg, utc_dt) + + +# ── download helpers ────────────────────────────────────────────────────────── + +def download_file(url: str, dest: Path, label: str = "") -> bool: + """Download a file with progress. Returns True on success.""" + if dest.exists() and dest.stat().st_size > 1000: + print(f" [skip] {label or dest.name} already downloaded") + return True + try: + print(f" [download] {label or dest.name} ...", end=" ", flush=True) + r = requests.get(url, timeout=120, stream=True) + r.raise_for_status() + total = int(r.headers.get("content-length", 0)) + downloaded = 0 + with open(dest, "wb") as f: + for chunk in r.iter_content(chunk_size=65536): + f.write(chunk) + downloaded += len(chunk) + mb = downloaded / 1_048_576 + print(f"OK ({mb:.1f} MB)") + return True + except Exception as e: + print(f"FAILED: {e}") + if dest.exists(): + dest.unlink() + return False + + +# ── twilight extraction ─────────────────────────────────────────────────────── + +def extract_twilight_events( + df: pd.DataFrame, + lat: float, + lon: float, + station_code: str, + station_name: str, + source: str, + threshold: float = PRIMARY_THRESHOLD, +) -> list[dict]: + """ + Given a dataframe with columns [timestamp, ghi] (UTC timestamps), + extract per-day dawn and dusk events. + + Dawn: first minute where GHI >= threshold after a run of zeros + Dusk: last minute where GHI >= threshold before a run of zeros + + Returns a list of dicts with keys: + date, event_type, utc_time, ghi_wm2, solar_depression_deg, + lat, lon, station_code, station_name, source, threshold_wm2 + """ + events = [] + + # Ensure UTC + if df["timestamp"].dt.tz is None: + df = df.copy() + df["timestamp"] = df["timestamp"].dt.tz_localize("UTC") + else: + df = df.copy() + df["timestamp"] = df["timestamp"].dt.tz_convert("UTC") + + # Sort and drop nulls + df = df.dropna(subset=["ghi"]).sort_values("timestamp").reset_index(drop=True) + df["ghi"] = pd.to_numeric(df["ghi"], errors="coerce") + df = df.dropna(subset=["ghi"]) + + # Group by UTC date (using the date of the timestamp) + df["date"] = df["timestamp"].dt.date + + for day, group in df.groupby("date"): + group = group.sort_values("timestamp").reset_index(drop=True) + ghi = group["ghi"].values + ts = group["timestamp"].values # numpy datetime64 + + # Find dawn: first index where GHI crosses threshold from below + # We require at least 3 consecutive sub-threshold readings before the crossing + dawn_idx = None + for i in range(3, len(ghi)): + if ghi[i] >= threshold and all(g < threshold for g in ghi[max(0, i-3):i]): + dawn_idx = i + break + + # Find dusk: last index where GHI is >= threshold followed by sub-threshold + dusk_idx = None + for i in range(len(ghi) - 4, -1, -1): + if ghi[i] >= threshold and all(g < threshold for g in ghi[i+1:min(len(ghi), i+4)]): + dusk_idx = i + break + + for idx, event_type in [(dawn_idx, "dawn"), (dusk_idx, "dusk")]: + if idx is None: + continue + ts_val = pd.Timestamp(ts[idx]) + utc_dt = ts_val.to_pydatetime().replace(tzinfo=timezone.utc) + dep = solar_depression_angle(lat, lon, utc_dt) + ghi_val = float(ghi[idx]) + + # Only record events where sun is clearly below horizon + # (eliminates cloudy days where GHI is suppressed during daytime) + if dep < 0: # sun above horizon — skip, not a real twilight event + continue + if dep > 25: # sun more than 25° below — likely data noise + continue + + events.append({ + "date": str(day), + "event_type": event_type, + "utc_time": utc_dt.strftime("%H:%M"), + "ghi_wm2": round(ghi_val, 2), + "solar_depression_deg": round(dep, 4), + "lat": lat, + "lon": lon, + "station_code": station_code, + "station_name": station_name, + "source": source, + "threshold_wm2": threshold, + }) + + return events + + +# ── dataset 1: BSRN 5-site parquet ─────────────────────────────────────────── + +FIVE_SITE_URL = "https://zenodo.org/records/10593079/files/bsrn_valid_five_sites.parquet" +FIVE_SITE_PATH = RAW_BSRN / "bsrn_valid_five_sites.parquet" + +FIVE_SITE_META = { + "mnm": {"name": "Minamitorishima", "lat": 24.2833, "lon": 153.9667}, + "asp": {"name": "Alice Springs", "lat": -23.7984, "lon": 133.8880}, + "car": {"name": "Carpentras", "lat": 44.0833, "lon": 5.0583}, + "bon": {"name": "Bondville", "lat": 40.0667, "lon": -88.3667}, + "son": {"name": "Sonnblick", "lat": 47.0542, "lon": 12.9578}, +} + + +def process_five_site() -> list[dict]: + """Download and process the BSRN 5-site parquet dataset.""" + print("\n=== BSRN 5-site dataset (Zenodo 10593079) ===") + ok = download_file(FIVE_SITE_URL, FIVE_SITE_PATH, "bsrn_valid_five_sites.parquet") + if not ok: + print(" ERROR: could not download 5-site dataset") + return [] + + print(" Loading parquet ...", end=" ", flush=True) + df_all = pd.read_parquet(FIVE_SITE_PATH) + print(f"OK — {len(df_all):,} rows, columns: {list(df_all.columns)}") + + all_events = [] + + # Detect timestamp column + ts_col = None + for c in ["timestamp", "time", "datetime", "date_time", "Timestamp", "TIME"]: + if c in df_all.columns: + ts_col = c + break + if ts_col is None: + # Use first datetime-like column + for c in df_all.columns: + if pd.api.types.is_datetime64_any_dtype(df_all[c]): + ts_col = c + break + if ts_col is None: + print(f" WARNING: cannot find timestamp column. Columns: {list(df_all.columns)[:20]}") + return [] + + # Detect GHI column + ghi_col = None + for c in ["ghi", "GHI", "global_horizontal_irradiance", "SWD", "swd", "global"]: + if c in df_all.columns: + ghi_col = c + break + if ghi_col is None: + print(f" WARNING: cannot find GHI column. Columns: {list(df_all.columns)[:20]}") + return [] + + # Detect station column + stn_col = None + for c in ["station", "sta", "site", "STATION"]: + if c in df_all.columns: + stn_col = c + break + + print(f" Using ts_col='{ts_col}', ghi_col='{ghi_col}', station_col='{stn_col}'") + + if stn_col: + groups = {stn: sub for stn, sub in df_all.groupby(stn_col)} + else: + # Single station file — infer from filename or use first meta entry + groups = {"unknown": df_all} + + for stn_code, sub in groups.items(): + stn_key = str(stn_code).lower() + meta = FIVE_SITE_META.get(stn_key, None) + if meta is None: + # Try to find lat/lon in data + if "latitude" in sub.columns and "longitude" in sub.columns: + lat = float(sub["latitude"].iloc[0]) + lon = float(sub["longitude"].iloc[0]) + name = stn_key + else: + print(f" SKIP {stn_key}: no metadata") + continue + else: + lat, lon, name = meta["lat"], meta["lon"], meta["name"] + + work = sub[[ts_col, ghi_col]].copy() + work.columns = ["timestamp", "ghi"] + work["timestamp"] = pd.to_datetime(work["timestamp"], utc=True, errors="coerce") + work = work.dropna(subset=["timestamp"]) + + print(f" Processing {name} ({stn_key}): {len(work):,} rows ...", end=" ", flush=True) + evts = extract_twilight_events(work, lat, lon, stn_key, name, "bsrn_5site") + print(f"{len(evts)} events") + all_events.extend(evts) + + return all_events + + +# ── dataset 2: CAELUS ZIP files ─────────────────────────────────────────────── + +CAELUS_BASE_URL = "https://zenodo.org/api/records/7897639/files/{filename}/content" + + +def caelus_filename(station_code: str, year: int) -> str: + return f"{station_code}_bsrn_{year}.zip" + + +def list_caelus_available() -> dict[str, list[int]]: + """ + Probe Zenodo API to get available files, return dict of station -> [years]. + """ + print("\n Querying CAELUS file list from Zenodo API ...", end=" ", flush=True) + try: + r = requests.get("https://zenodo.org/api/records/7897639", timeout=30) + r.raise_for_status() + data = r.json() + available: dict[str, list[int]] = {} + for f in data.get("files", []): + fname = f.get("key", "") + if fname.endswith(".zip") and "_bsrn_" in fname: + parts = fname.replace(".zip", "").split("_bsrn_") + if len(parts) == 2: + code = parts[0] + try: + year = int(parts[1]) + available.setdefault(code, []).append(year) + except ValueError: + pass + print(f"found {len(available)} stations") + return available + except Exception as e: + print(f"FAILED: {e}") + return {} + + +def process_caelus_zip(zip_path: Path, station_code: str, meta: dict, year: int) -> list[dict]: + """Extract parquet from ZIP, parse, return events.""" + events = [] + try: + with zipfile.ZipFile(zip_path, "r") as zf: + parquet_files = [n for n in zf.namelist() if n.endswith(".parquet")] + if not parquet_files: + print(f" WARNING: no parquet in {zip_path.name}") + return [] + for pf in parquet_files: + with zf.open(pf) as f: + df = pd.read_parquet(BytesIO(f.read())) + + # Detect columns + ts_col = None + for c in ["timestamp", "time", "datetime", "Timestamp", "TIME"]: + if c in df.columns: + ts_col = c + break + if ts_col is None: + for c in df.columns: + if pd.api.types.is_datetime64_any_dtype(df[c]): + ts_col = c + break + if ts_col is None: + print(f" WARNING: no timestamp in {pf}") + continue + + ghi_col = None + for c in ["ghi", "GHI", "SWD", "swd", "global_horizontal_irradiance", "global"]: + if c in df.columns: + ghi_col = c + break + if ghi_col is None: + # Use diffuse if available + for c in ["dhi", "DHI", "diffuse"]: + if c in df.columns: + ghi_col = c + break + if ghi_col is None: + print(f" WARNING: no GHI column in {pf}. cols={list(df.columns)[:15]}") + continue + + work = df[[ts_col, ghi_col]].copy() + work.columns = ["timestamp", "ghi"] + work["timestamp"] = pd.to_datetime(work["timestamp"], utc=True, errors="coerce") + work = work.dropna(subset=["timestamp"]) + + evts = extract_twilight_events( + work, + meta["lat"], meta["lon"], + station_code, meta["name"], + f"caelus_{year}", + ) + events.extend(evts) + + except zipfile.BadZipFile as e: + print(f" ERROR: bad zip {zip_path.name}: {e}") + except Exception as e: + print(f" ERROR processing {zip_path.name}: {e}") + + return events + + +def process_caelus(max_stations: int = 8, max_years_per_station: int = 5) -> list[dict]: + """Download and process CAELUS priority stations.""" + print("\n=== CAELUS dataset (Zenodo 7897639) ===") + available = list_caelus_available() + + all_events = [] + + processed_count = 0 + for code, meta in PRIORITY_STATIONS.items(): + if processed_count >= max_stations: + break + + if code not in available: + print(f" SKIP {code} ({meta['name']}): not in CAELUS dataset") + continue + + years = sorted(available[code]) + years_to_use = years[:max_years_per_station] + print(f"\n Station: {meta['name']} ({code}) | lat={meta['lat']} lon={meta['lon']} | years: {years_to_use}") + + station_events = [] + for year in years_to_use: + fname = caelus_filename(code, year) + url = CAELUS_BASE_URL.format(filename=fname) + dest = RAW_BSRN / fname + + ok = download_file(url, dest, fname) + if not ok: + continue + + print(f" Processing {fname} ...", end=" ", flush=True) + evts = process_caelus_zip(dest, code, meta, year) + print(f"{len(evts)} events") + station_events.extend(evts) + + print(f" Total events for {code}: {len(station_events)}") + all_events.extend(station_events) + processed_count += 1 + + return all_events + + +# ── SURFRAD download ────────────────────────────────────────────────────────── + +SURFRAD_STATIONS = { + "dra": { + "name": "Desert Rock", + "lat": 36.626, "lon": -116.018, + "ftp_dir": "DRA", + }, + "tbl": { + "name": "Table Mountain", + "lat": 40.125, "lon": -105.237, + "ftp_dir": "TBL", + }, +} +SURFRAD_BASE = "https://gml.noaa.gov/aftp/data/radiation/surfrad" + + +def parse_surfrad_file(content: bytes, station_meta: dict) -> pd.DataFrame: + """ + Parse SURFRAD dat file format. + Returns dataframe with [timestamp, ghi] columns. + """ + lines = content.decode("utf-8", errors="replace").splitlines() + + # Skip header (first line is site info, second is column header) + # Format: year jday hour min dt zen dw_solar dw_solar_stdev ... + records = [] + for line in lines[2:]: + parts = line.split() + if len(parts) < 9: + continue + try: + yr = int(parts[0]) + jday = int(parts[1]) + hr = int(parts[2]) + mn = int(parts[3]) + # GHI is column index 6 (dw_solar = downwelling shortwave) + ghi_raw = float(parts[6]) + + # Build datetime + dt = datetime(yr, 1, 1, hr, mn, tzinfo=timezone.utc) + timedelta(days=jday - 1) + + # SURFRAD uses -9999 for missing + ghi = None if ghi_raw < -900 else max(0.0, ghi_raw) + records.append({"timestamp": dt, "ghi": ghi}) + except (ValueError, IndexError): + continue + + return pd.DataFrame(records) + + +def process_surfrad(years: list[int] | None = None) -> list[dict]: + """Download and process SURFRAD Desert Rock and Table Mountain.""" + print("\n=== SURFRAD dataset (NOAA GML FTP) ===") + if years is None: + years = [2018, 2019, 2020] # 3 recent years + + all_events = [] + + for code, meta in SURFRAD_STATIONS.items(): + station_dir = meta["ftp_dir"] + print(f"\n Station: {meta['name']} ({code}) | lat={meta['lat']} lon={meta['lon']}") + + for year in years: + print(f" Year {year}:") + station_events = [] + + # SURFRAD stores one file per day: DRA/YYYY/draYYYjjj.dat + # where jjj = day of year (001-365) + # We'll try to get the full year index + year_dir_url = f"{SURFRAD_BASE}/{station_dir}/{year}/" + try: + r = requests.get(year_dir_url, timeout=30) + if r.status_code != 200: + print(f" Cannot access {year_dir_url}: {r.status_code}") + continue + # Parse file listing (Apache directory index) + import re + files = re.findall(rf'{code.lower()}\d{{5}}\.dat', r.text) + files = sorted(set(files)) + print(f" Found {len(files)} daily files") + + dest_dir = RAW_BSRN / "surfrad" / station_dir / str(year) + dest_dir.mkdir(parents=True, exist_ok=True) + + day_records = [] + for fname in files: + dest = dest_dir / fname + file_url = f"{year_dir_url}{fname}" + ok = download_file(file_url, dest, fname) + if not ok: + continue + with open(dest, "rb") as f: + content = f.read() + day_df = parse_surfrad_file(content, meta) + if not day_df.empty: + day_records.append(day_df) + + if day_records: + df_year = pd.concat(day_records, ignore_index=True) + print(f" {len(df_year):,} 1-min records loaded", end=" ") + evts = extract_twilight_events( + df_year, meta["lat"], meta["lon"], + code, meta["name"], f"surfrad_{year}", + ) + print(f"→ {len(evts)} twilight events") + station_events.extend(evts) + + except Exception as e: + print(f" ERROR: {e}") + continue + + all_events.extend(station_events) + + return all_events + + +# ── save results ────────────────────────────────────────────────────────────── + +def save_sightings(events: list[dict], filename: str) -> Path: + """Save events list to CSV.""" + if not events: + print(f" WARNING: no events to save to {filename}") + return RAW_SIGHTINGS / filename + df = pd.DataFrame(events) + df = df.sort_values(["station_code", "date", "event_type"]).reset_index(drop=True) + out = RAW_SIGHTINGS / filename + df.to_csv(out, index=False) + print(f" Saved {len(df)} events → {out}") + return out + + +def print_summary(events: list[dict], label: str): + if not events: + print(f"\n{label}: no events") + return + df = pd.DataFrame(events) + print(f"\n{'='*60}") + print(f"SUMMARY: {label}") + print(f" Total events: {len(df)}") + print(f" Dawn events: {(df['event_type']=='dawn').sum()}") + print(f" Dusk events: {(df['event_type']=='dusk').sum()}") + print(f" Stations: {df['station_code'].nunique()} — {sorted(df['station_code'].unique())}") + print(f" Date range: {df['date'].min()} to {df['date'].max()}") + dep = df["solar_depression_deg"] + print(f" Depression angle: mean={dep.mean():.2f}° std={dep.std():.2f}° " + f"min={dep.min():.2f}° max={dep.max():.2f}°") + if "event_type" in df.columns: + for evt_type in ["dawn", "dusk"]: + sub = df[df["event_type"] == evt_type] + if not sub.empty: + d = sub["solar_depression_deg"] + print(f" {evt_type.upper()} depression: mean={d.mean():.2f}° " + f"p10={d.quantile(0.1):.2f}° p90={d.quantile(0.9):.2f}°") + print(f"{'='*60}") + + +# ── main ────────────────────────────────────────────────────────────────────── + +def main(): + print("BSRN Twilight Extractor") + print(f"Output dir: {RAW_SIGHTINGS}") + print(f"Raw data: {RAW_BSRN}") + print(f"GHI threshold: {PRIMARY_THRESHOLD} W/m²\n") + + all_events = [] + + # 1. BSRN 5-site (quick, 58.5 MB) + five_site_events = process_five_site() + save_sightings(five_site_events, "bsrn_5site_twilight.csv") + print_summary(five_site_events, "BSRN 5-site dataset") + all_events.extend(five_site_events) + + # 2. CAELUS (priority stations, up to 8 stations × 5 years) + caelus_events = process_caelus(max_stations=8, max_years_per_station=5) + save_sightings(caelus_events, "bsrn_caelus_twilight.csv") + print_summary(caelus_events, "CAELUS dataset") + all_events.extend(caelus_events) + + # 3. SURFRAD (Desert Rock + Table Mountain) + surfrad_events = process_surfrad(years=[2018, 2019, 2020]) + save_sightings(surfrad_events, "surfrad_twilight.csv") + print_summary(surfrad_events, "SURFRAD dataset") + all_events.extend(surfrad_events) + + # Combined + save_sightings(all_events, "bsrn_all_twilight.csv") + print_summary(all_events, "ALL SOURCES COMBINED") + + print("\nDone.") + + +if __name__ == "__main__": + main() diff --git a/src/bsrn_process.py b/src/bsrn_process.py new file mode 100644 index 0000000..b6568cd --- /dev/null +++ b/src/bsrn_process.py @@ -0,0 +1,563 @@ +""" +BSRN Twilight Extractor — corrected for actual file formats. + +Dataset formats: +1. BSRN 5-site parquet (Zenodo 10593079): + - Multi-index: (times_utc, site) + - Columns: climate, longitude, latitude, sza, eth, ghicda, ghics, difcs, ghi, dif, sky_type + - 'site' values like 'asp/bsrn', 'bon/bsrn', etc. + +2. CAELUS ZIPs (Zenodo 7897639): + - Each ZIP contains one CSV named {code}_bsrn_{year}.csv + - Columns: times_utc, sza, eth, ghi, dif, ghics, ghicda, sky_type + - sza = solar zenith angle (degrees). Depression = sza - 90 when sza > 90. + +For twilight extraction we use BOTH ghi AND dif (diffuse horizontal irradiance). +Diffuse is more sensitive at twilight — it captures scattered skylight before direct beam arrives. + +The `sza` column is authoritative for solar geometry — no need to recompute. +Depression angle = sza - 90° (valid when sza > 90°, i.e., sun below horizon). + +Thresholds extracted (multi-threshold approach): +- 0.5, 1, 2, 5, 10 W/m² for both ghi and dif + +Primary threshold for the main output: dif >= 1.0 W/m² (diffuse is more reliable at twilight). +""" + +import os +import math +import zipfile +import requests +import re +import pandas as pd +import numpy as np +from datetime import datetime, timezone, timedelta +from pathlib import Path +from io import BytesIO + +# ── paths ───────────────────────────────────────────────────────────────────── +BASE = Path("/Volumes/X9/Sites/acamarata/pray-calc-ml") +RAW_BSRN = BASE / "data/raw/bsrn" +RAW_SIGHTINGS = BASE / "data/raw/raw_sightings" +RAW_BSRN.mkdir(parents=True, exist_ok=True) +RAW_SIGHTINGS.mkdir(parents=True, exist_ok=True) + +# ── thresholds ──────────────────────────────────────────────────────────────── +THRESHOLDS = [0.5, 1.0, 2.0, 5.0, 10.0] +PRIMARY_THRESHOLD = 1.0 # W/m² + +# ── station metadata ────────────────────────────────────────────────────────── +# For CAELUS: station codes and their coordinates +CAELUS_STATION_META = { + "sbo": {"name": "Sede Boqer", "lat": 30.8597, "lon": 34.7794, "region": "Negev/Israel"}, + "tam": {"name": "Tamanrasset", "lat": 22.7903, "lon": 5.5292, "region": "Algeria"}, + "xia": {"name": "Xianghe", "lat": 39.754, "lon": 116.962, "region": "China"}, + "iza": {"name": "Izana", "lat": 28.3093, "lon": -16.4993, "region": "Canary Islands/Spain"}, + "ish": {"name": "Ishigakijima", "lat": 24.3367, "lon": 124.1644, "region": "Japan"}, + "dra": {"name": "Desert Rock", "lat": 36.626, "lon": -116.018, "region": "Nevada/USA"}, + "e13": {"name": "SouthernGreatPlains", "lat": 36.605, "lon": -97.485, "region": "Oklahoma/USA"}, + "tat": {"name": "Tateno", "lat": 36.0581, "lon": 140.1258, "region": "Japan"}, + "fua": {"name": "Fukuoka", "lat": 33.5822, "lon": 130.3764, "region": "Japan"}, + "gcr": {"name": "GoodwinCreek", "lat": 34.2547, "lon": -89.8729, "region": "Mississippi/USA"}, + "bil": {"name": "Billings/ARM", "lat": 36.605, "lon": -97.516, "region": "Oklahoma/USA"}, + "abs": {"name": "Abashiri", "lat": 44.0178, "lon": 144.2797, "region": "Japan"}, + "lyu": {"name": "Lanyu Island", "lat": 22.037, "lon": 121.5583, "region": "Taiwan"}, + "lln": {"name": "Lulin", "lat": 23.4686, "lon": 120.8736, "region": "Taiwan"}, + "cnr": {"name": "Cener", "lat": 42.816, "lon": -1.601, "region": "Spain/Navarra"}, + "flo": {"name": "Florina", "lat": 40.78, "lon": 21.38, "region": "Greece"}, + "car": {"name": "Carpentras", "lat": 44.0833, "lon": 5.0583, "region": "France"}, +} + +# Five-site parquet station metadata (lat/lon stored in the file itself) +FIVE_SITE_MAP = { + "mnm": {"name": "Minamitorishima", "region": "Japan"}, + "asp": {"name": "Alice Springs", "region": "Australia"}, + "car": {"name": "Carpentras", "region": "France"}, + "bon": {"name": "Bondville", "region": "USA/Illinois"}, + "son": {"name": "Sonnblick", "region": "Austria"}, +} + +CAELUS_BASE_URL = "https://zenodo.org/api/records/7897639/files/{filename}/content" +FIVE_SITE_URL = "https://zenodo.org/records/10593079/files/bsrn_valid_five_sites.parquet" +FIVE_SITE_PATH = RAW_BSRN / "bsrn_valid_five_sites.parquet" + +# ── download helper ─────────────────────────────────────────────────────────── + +def download_file(url: str, dest: Path, label: str = "") -> bool: + if dest.exists() and dest.stat().st_size > 1000: + print(f" [cached] {label or dest.name}") + return True + try: + print(f" [dl] {label or dest.name} ...", end=" ", flush=True) + r = requests.get(url, timeout=180, stream=True) + r.raise_for_status() + downloaded = 0 + with open(dest, "wb") as f: + for chunk in r.iter_content(chunk_size=131072): + f.write(chunk) + downloaded += len(chunk) + print(f"OK ({downloaded/1_048_576:.1f} MB)") + return True + except Exception as e: + print(f"FAILED: {e}") + if dest.exists(): + dest.unlink() + return False + + +# ── twilight extraction ─────────────────────────────────────────────────────── + +def extract_events_from_df( + df: pd.DataFrame, + lat: float, + lon: float, + station_code: str, + station_name: str, + source: str, +) -> list[dict]: + """ + df must have columns: times_utc (datetime64[ns] UTC), sza (float), ghi (float), dif (float). + + For each day: + dawn = first minute where dif (or ghi if dif is null) crosses PRIMARY_THRESHOLD + AND sza > 90 (sun still below horizon) + dusk = last minute where dif (or ghi) >= PRIMARY_THRESHOLD + AND sza > 90 + + Records depression angle = sza - 90.0 at the event moment. + Also records raw threshold-by-threshold data for all THRESHOLDS. + """ + events = [] + + df = df.copy() + df["times_utc"] = pd.to_datetime(df["times_utc"], utc=True, errors="coerce") + df = df.dropna(subset=["times_utc"]).sort_values("times_utc").reset_index(drop=True) + + # Build irradiance signal: prefer dif (diffuse) which is more sensitive at twilight + # Fall back to ghi if dif is all NaN + if "dif" in df.columns: + df["irr"] = pd.to_numeric(df["dif"], errors="coerce") + else: + df["irr"] = float("nan") + + if "ghi" in df.columns: + ghi_vals = pd.to_numeric(df["ghi"], errors="coerce") + # Fill irr NaN with ghi + df["irr"] = df["irr"].combine_first(ghi_vals) + + df["sza"] = pd.to_numeric(df["sza"], errors="coerce") + df["date"] = df["times_utc"].dt.date + + for day, grp in df.groupby("date"): + grp = grp.sort_values("times_utc").reset_index(drop=True) + + irr = grp["irr"].values + sza = grp["sza"].values + ts = grp["times_utc"].values + + n = len(irr) + if n < 10: + continue + + # For each threshold, extract dawn/dusk + for threshold in THRESHOLDS: + # Dawn: first index where irr >= threshold AND sza > 90 + # Require at least 2 consecutive sub-threshold readings immediately before + dawn_idx = None + for i in range(2, n): + if ( + not np.isnan(irr[i]) + and irr[i] >= threshold + and not np.isnan(sza[i]) + and sza[i] > 90.0 + ): + # Confirm at least 2 prior readings were sub-threshold + prev_ok = all( + (np.isnan(v) or v < threshold) + for v in irr[max(0, i-3):i] + ) + if prev_ok: + dawn_idx = i + break + + # Dusk: last index where irr >= threshold AND sza > 90 + # Require at least 2 consecutive sub-threshold readings immediately after + dusk_idx = None + for i in range(n - 3, -1, -1): + if ( + not np.isnan(irr[i]) + and irr[i] >= threshold + and not np.isnan(sza[i]) + and sza[i] > 90.0 + ): + after_ok = all( + (np.isnan(v) or v < threshold) + for v in irr[i+1:min(n, i+4)] + ) + if after_ok: + dusk_idx = i + break + + for idx, event_type in [(dawn_idx, "dawn"), (dusk_idx, "dusk")]: + if idx is None: + continue + + sza_val = float(sza[idx]) + # sza must be > 90 (sun below horizon) — extra safety check + if sza_val <= 90.0: + continue + # Depression angle in degrees + dep = round(sza_val - 90.0, 4) + # Only record plausible twilight depression (0.5° to 20°) + if dep < 0.3 or dep > 20.0: + continue + + utc_ts = pd.Timestamp(ts[idx]) + irr_val = float(irr[idx]) + + # Also grab ghi/dif individually at this moment + ghi_at = float(grp["ghi"].iloc[idx]) if "ghi" in grp.columns else float("nan") + dif_at = float(grp["dif"].iloc[idx]) if "dif" in grp.columns else float("nan") + + events.append({ + "date": str(day), + "event_type": event_type, + "utc_time": utc_ts.strftime("%H:%M:%S"), + "sza_deg": round(sza_val, 4), + "solar_depression_deg": dep, + "irr_wm2": round(irr_val, 2), + "ghi_wm2": round(ghi_at, 2) if not math.isnan(ghi_at) else None, + "dif_wm2": round(dif_at, 2) if not math.isnan(dif_at) else None, + "threshold_wm2": threshold, + "lat": lat, + "lon": lon, + "station_code": station_code, + "station_name": station_name, + "source": source, + }) + + return events + + +# ── 5-site dataset ──────────────────────────────────────────────────────────── + +def process_five_site() -> list[dict]: + print("\n=== BSRN 5-site parquet (Zenodo 10593079) ===") + ok = download_file(FIVE_SITE_URL, FIVE_SITE_PATH, "bsrn_valid_five_sites.parquet") + if not ok: + return [] + + print(" Loading ...", end=" ", flush=True) + df_raw = pd.read_parquet(FIVE_SITE_PATH) + df_raw = df_raw.reset_index() # brings times_utc and site into columns + print(f"OK — {len(df_raw):,} rows") + print(f" Columns: {list(df_raw.columns)}") + + all_events = [] + for site_val, grp in df_raw.groupby("site"): + # site_val is like 'asp/bsrn' + code = str(site_val).split("/")[0].lower() + meta = FIVE_SITE_MAP.get(code, {"name": code, "region": "unknown"}) + + # Get lat/lon from the data itself + lat = float(grp["latitude"].iloc[0]) + lon = float(grp["longitude"].iloc[0]) + + # Rename columns to standard + sub = grp[["times_utc", "sza", "ghi", "dif"]].copy() + + print(f" {meta['name']} ({code}) lat={lat:.3f} lon={lon:.3f}: " + f"{len(sub):,} rows ...", end=" ", flush=True) + + evts = extract_events_from_df(sub, lat, lon, code, meta["name"], "bsrn_5site") + print(f"{len(evts)} events") + all_events.extend(evts) + + return all_events + + +# ── CAELUS dataset ───────────────────────────────────────────────────────────── + +def list_caelus_files() -> dict[str, list[int]]: + """Return dict of station_code -> sorted list of available years.""" + print(" Querying Zenodo API ...", end=" ", flush=True) + try: + r = requests.get("https://zenodo.org/api/records/7897639", timeout=30) + r.raise_for_status() + data = r.json() + available: dict[str, list[int]] = {} + for f in data.get("files", []): + fname = f.get("key", "") + if fname.endswith(".zip") and "_bsrn_" in fname: + parts = fname.replace(".zip", "").split("_bsrn_") + if len(parts) == 2: + code = parts[0] + try: + year = int(parts[1]) + available.setdefault(code, []).append(year) + except ValueError: + pass + for k in available: + available[k] = sorted(available[k]) + print(f"{len(available)} stations found") + return available + except Exception as e: + print(f"FAILED: {e}") + return {} + + +def process_caelus_zip(zip_path: Path, station_code: str, meta: dict, year: int) -> list[dict]: + events = [] + try: + with zipfile.ZipFile(zip_path, "r") as zf: + csv_files = [n for n in zf.namelist() if n.endswith(".csv")] + if not csv_files: + print(f" WARNING: no CSV in {zip_path.name}") + return [] + with zf.open(csv_files[0]) as f: + df = pd.read_csv(f) + + # Validate expected columns + required = {"times_utc", "sza", "ghi"} + missing = required - set(df.columns) + if missing: + print(f" WARNING: missing columns {missing} in {zip_path.name}") + return [] + + evts = extract_events_from_df( + df, meta["lat"], meta["lon"], + station_code, meta["name"], f"caelus_{year}", + ) + return evts + + except zipfile.BadZipFile: + print(f" ERROR: corrupt zip {zip_path.name}") + return [] + except Exception as e: + print(f" ERROR {zip_path.name}: {e}") + return [] + + +def process_caelus(priority_codes: list[str] | None = None, max_years: int = 5) -> list[dict]: + print("\n=== CAELUS dataset (Zenodo 7897639) ===") + available = list_caelus_files() + if not available: + return [] + + if priority_codes is None: + # All known stations, sorted by relevance + priority_codes = list(CAELUS_STATION_META.keys()) + + all_events = [] + + for code in priority_codes: + if code not in available: + print(f" SKIP {code}: not in CAELUS") + continue + + meta = CAELUS_STATION_META.get(code) + if meta is None: + print(f" SKIP {code}: no metadata") + continue + + years = available[code][:max_years] + print(f"\n {meta['name']} ({code}) " + f"lat={meta['lat']} lon={meta['lon']} region={meta['region']}") + print(f" Years: {years}") + + station_events = [] + for year in years: + fname = f"{code}_bsrn_{year}.zip" + url = CAELUS_BASE_URL.format(filename=fname) + dest = RAW_BSRN / fname + + ok = download_file(url, dest, fname) + if not ok: + continue + + print(f" {fname} ...", end=" ", flush=True) + evts = process_caelus_zip(dest, code, meta, year) + print(f"{len(evts)} events ({year})") + station_events.extend(evts) + + print(f" → {len(station_events)} events total for {code}") + all_events.extend(station_events) + + return all_events + + +# ── SURFRAD ─────────────────────────────────────────────────────────────────── + +SURFRAD_STATIONS = { + "dra": {"name": "Desert Rock", "lat": 36.626, "lon": -116.018, "dir": "DRA"}, + "tbl": {"name": "TableMountain","lat": 40.125, "lon": -105.237, "dir": "TBL"}, +} +SURFRAD_BASE = "https://gml.noaa.gov/aftp/data/radiation/surfrad" + + +def parse_surfrad_day(content: bytes) -> pd.DataFrame: + """Parse a SURFRAD daily .dat file to a dataframe with [times_utc, sza, ghi, dif].""" + lines = content.decode("utf-8", errors="replace").splitlines() + records = [] + for line in lines[2:]: + parts = line.split() + if len(parts) < 10: + continue + try: + yr = int(parts[0]) + jday = int(parts[1]) + hr = int(parts[2]) + mn = int(parts[3]) + zen = float(parts[5]) # solar zenith angle (degrees) + ghi = float(parts[6]) # downwelling shortwave (W/m²) + dif = float(parts[9]) # diffuse shortwave (W/m²) + + dt = datetime(yr, 1, 1, hr, mn, tzinfo=timezone.utc) + timedelta(days=jday - 1) + + # -9999 = missing + records.append({ + "times_utc": dt, + "sza": None if zen < -900 else zen, + "ghi": None if ghi < -900 else max(0.0, ghi), + "dif": None if dif < -900 else max(0.0, dif), + }) + except (ValueError, IndexError): + continue + return pd.DataFrame(records) + + +def process_surfrad(years: list[int] | None = None) -> list[dict]: + print("\n=== SURFRAD (NOAA GML) ===") + if years is None: + years = [2018, 2019, 2020] + + all_events = [] + + for code, meta in SURFRAD_STATIONS.items(): + print(f"\n {meta['name']} ({code}) lat={meta['lat']} lon={meta['lon']}") + dest_root = RAW_BSRN / "surfrad" / meta["dir"] + dest_root.mkdir(parents=True, exist_ok=True) + + for year in years: + year_url = f"{SURFRAD_BASE}/{meta['dir']}/{year}/" + try: + r = requests.get(year_url, timeout=30) + if r.status_code != 200: + print(f" {year}: HTTP {r.status_code} — skipping") + continue + # Parse file listing + files = sorted(set(re.findall(rf'{code}\d{{5}}\.dat', r.text))) + print(f" {year}: {len(files)} daily files", end=" ") + except Exception as e: + print(f" {year}: ERROR listing: {e}") + continue + + dest_dir = dest_root / str(year) + dest_dir.mkdir(parents=True, exist_ok=True) + + day_dfs = [] + for fname in files: + dest = dest_dir / fname + furl = f"{year_url}{fname}" + ok = download_file(furl, dest, fname) + if not ok: + continue + with open(dest, "rb") as f: + day_df = parse_surfrad_day(f.read()) + if not day_df.empty: + day_dfs.append(day_df) + + if not day_dfs: + print("→ 0 events") + continue + + df_year = pd.concat(day_dfs, ignore_index=True) + evts = extract_events_from_df( + df_year, meta["lat"], meta["lon"], + code, meta["name"], f"surfrad_{year}", + ) + print(f"→ {len(evts)} events") + all_events.extend(evts) + + return all_events + + +# ── save / summarise ────────────────────────────────────────────────────────── + +def save_events(events: list[dict], filename: str) -> Path: + out = RAW_SIGHTINGS / filename + if not events: + print(f" (no events for {filename})") + return out + df = pd.DataFrame(events) + df = df.sort_values(["station_code", "date", "event_type", "threshold_wm2"]) + df.to_csv(out, index=False) + print(f" Saved {len(df):,} rows → {out}") + return out + + +def summarise(events: list[dict], label: str): + print(f"\n{'='*65}") + print(f"SUMMARY: {label}") + if not events: + print(" (no events)") + print("=" * 65) + return + df = pd.DataFrame(events) + # Focus summary on primary threshold + prim = df[df["threshold_wm2"] == PRIMARY_THRESHOLD] + print(f" All threshold events: {len(df):,}") + print(f" At {PRIMARY_THRESHOLD} W/m² threshold: {len(prim):,}") + print(f" Stations: {df['station_code'].nunique()} — {sorted(df['station_code'].unique())}") + if not prim.empty: + print(f" Date range: {prim['date'].min()} to {prim['date'].max()}") + for evt_type in ["dawn", "dusk"]: + sub = prim[prim["event_type"] == evt_type] + if sub.empty: + continue + d = sub["solar_depression_deg"] + print(f" {evt_type.upper()} events: {len(sub):,} | " + f"mean dep={d.mean():.2f}° std={d.std():.2f}° " + f"p10={d.quantile(0.1):.2f}° p90={d.quantile(0.9):.2f}°") + print("=" * 65) + + +# ── main ────────────────────────────────────────────────────────────────────── + +def main(): + print("BSRN Twilight Extractor v2") + print(f"Primary threshold: {PRIMARY_THRESHOLD} W/m²") + print(f"All thresholds: {THRESHOLDS}") + print() + + all_events = [] + + # 1. BSRN 5-site + five_events = process_five_site() + save_events(five_events, "bsrn_5site_twilight.csv") + summarise(five_events, "BSRN 5-site") + all_events.extend(five_events) + + # 2. CAELUS — priority stations in 20-45°N + priority = ["sbo", "tam", "xia", "iza", "ish", "dra", "e13", "tat", "fua", + "gcr", "bil", "abs", "lyu", "cnr", "flo"] + caelus_events = process_caelus(priority_codes=priority, max_years=5) + save_events(caelus_events, "bsrn_caelus_twilight.csv") + summarise(caelus_events, "CAELUS") + all_events.extend(caelus_events) + + # 3. SURFRAD — Desert Rock (arid, 36.6°N) + Table Mountain + surfrad_events = process_surfrad(years=[2018, 2019, 2020]) + save_events(surfrad_events, "surfrad_twilight.csv") + summarise(surfrad_events, "SURFRAD") + all_events.extend(surfrad_events) + + # Combined output + save_events(all_events, "bsrn_all_twilight.csv") + summarise(all_events, "ALL SOURCES") + + print("\nDone.") + + +if __name__ == "__main__": + main() diff --git a/src/collect/academic_paper_fetcher.py b/src/collect/academic_paper_fetcher.py new file mode 100644 index 0000000..1e4e1a6 --- /dev/null +++ b/src/collect/academic_paper_fetcher.py @@ -0,0 +1,93 @@ +import requests +import os +import re +from pathlib import Path +import time + +PAPERS_DIR = Path("research/downloaded_papers") +PAPERS_DIR.mkdir(parents=True, exist_ok=True) + +# Queries targeting Islamic twilight observations +QUERIES = [ + "fajr twilight observation angle", + "subuh sky brightness meter", + "shafaq twilight measurement isha", + "depression angle fajr isha", + "true dawn observation measurements" +] + +def fetch_openalex_papers(query): + url = f"https://api.openalex.org/works?search={query}&per-page=50&filter=has_pdf_url:true" + print(f"Querying OpenAlex: {query}") + try: + res = requests.get(url, timeout=15) + if res.status_code != 200: + return [] + data = res.json() + return data.get("results", []) + except Exception as e: + print(f"Error fetching {query}: {e}") + return [] + +def sanitize_filename(title): + return re.sub(r"[^a-zA-Z0-9]+", "_", title).strip("_") + +def main(): + seen_titles = set() + total_downloaded = 0 + + # Check existing files to avoid duplicates + existing_files = list(PAPERS_DIR.glob("*.pdf")) + for file in existing_files: + name_no_ext = file.stem.lower() + seen_titles.add(name_no_ext) + + for query in QUERIES: + works = fetch_openalex_papers(query) + for work in works: + title = work.get("title") + if not title: + continue + + clean_title = sanitize_filename(title).lower() + + # Simple duplicate check + if any(clean_title[:30] in seen[:30] for seen in seen_titles): + print(f"Skipping duplicate/known: {title}") + continue + + pdf_url = work.get("open_access", {}).get("oa_url") + if not pdf_url or not pdf_url.endswith(".pdf"): + continue + + year = work.get("publication_year", "0000") + filename = f"{year}_{clean_title[:60]}.pdf" + filepath = PAPERS_DIR / filename + + if filepath.exists(): + print(f"Already downloaded: {filename}") + continue + + print(f"Downloading: {title} ({year})") + print(f" URL: {pdf_url}") + try: + # Add headers to mimic browser + headers = {'User-Agent': 'Mozilla/5.0'} + pdf_res = requests.get(pdf_url, headers=headers, timeout=20) + if pdf_res.status_code == 200: + with open(filepath, "wb") as f: + f.write(pdf_res.content) + print(f" -> Saved as {filename}") + seen_titles.add(clean_title) + total_downloaded += 1 + else: + print(f" -> Failed with status {pdf_res.status_code}") + except Exception as e: + print(f" -> Download failed: {e}") + + time.sleep(1) # Be nice to servers + + print(f"\nTotal new papers downloaded: {total_downloaded}") + +if __name__ == "__main__": + main() diff --git a/src/collect/download_gan_mn.py b/src/collect/download_gan_mn.py new file mode 100644 index 0000000..6383530 --- /dev/null +++ b/src/collect/download_gan_mn.py @@ -0,0 +1,226 @@ +""" +Download all GaN-MN monthly and annual CSV files, then process each one +with the GaN-MN processor and save results to data/raw/raw_sightings/. + +Run from repo root: + /Volumes/X9/Sites/acamarata/pray-calc-ml/.venv/bin/python \ + src/collect/download_gan_mn.py +""" +from __future__ import annotations + +import csv +import logging +import sys +import time +from pathlib import Path + +import requests + +# Add repo root to path so we can import the processor +REPO_ROOT = Path(__file__).resolve().parent.parent.parent +sys.path.insert(0, str(REPO_ROOT)) + +from src.collect.gan_mn_processor import process_gan_mn_csv # noqa: E402 + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S", +) +log = logging.getLogger(__name__) + +RAW_DIR = REPO_ROOT / "data" / "raw" / "gan_mn" +SIGHTINGS_DIR = REPO_ROOT / "data" / "raw" / "raw_sightings" +RAW_DIR.mkdir(parents=True, exist_ok=True) +SIGHTINGS_DIR.mkdir(parents=True, exist_ok=True) + +# All monthly files, 2022-2025 (skip already-downloaded ones) +# Format: (label, bit.ly URL, local filename) +MONTHLY_FILES = [ + # 2025 (skip Jun 2025 and Jan 2025 — already have them) + ("Aug_2025", "https://bit.ly/4a9aXIP", "GaN-MN_August_2025.csv"), + ("Sep_2025", "https://bit.ly/4sm4pxY", "GaN-MN_September_2025.csv"), + ("Jul_2025", "https://bit.ly/4jlEQtz", "GaN-MN_July_2025.csv"), + ("May_2025", "https://bit.ly/47w5sCA", "GaN-MN_May_2025.csv"), + ("Apr_2025", "https://bit.ly/4nxs6Bm", "GaN-MN_April_2025.csv"), + ("Mar_2025", "https://bit.ly/47mPHzx", "GaN-MN_March_2025.csv"), + ("Feb_2025", "https://bit.ly/4lbwPXg", "GaN-MN_February_2025.csv"), + # 2024 (skip Aug 2024 — already have it) + ("Dec_2024", "https://bit.ly/4mHSBno", "GaN-MN_December_2024.csv"), + ("Nov_2024", "https://bit.ly/4lVwidL", "GaN-MN_November_2024.csv"), + ("Oct_2024", "https://bit.ly/449h3ay", "GaN-MN_October_2024.csv"), + ("Sep_2024", "https://bit.ly/4hUE458", "GaN-MN_September_2024.csv"), + ("Jul_2024", "https://bit.ly/4fAjvJi", "GaN-MN_July_2024.csv"), + ("Jun_2024", "https://bit.ly/3ZvT1DR", "GaN-MN_June_2024.csv"), + ("May_2024", "https://bit.ly/4f3veR8", "GaN-MN_May_2024.csv"), + ("Apr_2024", "https://bit.ly/4etfYg0", "GaN-MN_April_2024.csv"), + ("Mar_2024", "https://bit.ly/3Z69pLy", "GaN-MN_March_2024.csv"), + ("Feb_2024", "https://bit.ly/46wtyfJ", "GaN-MN_February_2024.csv"), + ("Jan_2024", "https://bit.ly/4cMztz7", "GaN-MN_January_2024.csv"), + # 2023 + ("Dec_2023", "https://bit.ly/45dEDl3", "GaN-MN_December_2023.csv"), + ("Nov_2023", "https://bit.ly/4ajnV4v", "GaN-MN_November_2023.csv"), + ("Oct_2023", "https://bit.ly/3PKzryD", "GaN-MN_October_2023.csv"), + ("Sep_2023", "https://bit.ly/3ww3T8K", "GaN-MN_September_2023.csv"), + ("Aug_2023", "https://bit.ly/3So1UKV", "GaN-MN_August_2023.csv"), + ("Jul_2023", "https://bit.ly/3S3mG3u", "GaN-MN_July_2023.csv"), + ("Jun_2023", "https://bit.ly/46Es4OL", "GaN-MN_June_2023.csv"), + ("May_2023", "https://bit.ly/46WbF9m", "GaN-MN_May_2023.csv"), + ("Apr_2023", "https://bit.ly/3rxWWSy", "GaN-MN_April_2023.csv"), + ("Mar_2023", "https://bit.ly/3OW45nv", "GaN-MN_March_2023.csv"), + ("Feb_2023", "https://bit.ly/3rSY2br", "GaN-MN_February_2023.csv"), + ("Jan_2023", "https://bit.ly/3JGJpy1", "GaN-MN_January_2023.csv"), + # 2022 + ("Dec_2022", "https://bit.ly/43CelXB", "GaN-MN_December_2022.csv"), + ("Nov_2022", "https://bit.ly/429RdyR", "GaN-MN_November_2022.csv"), + ("Oct_2022", "https://bit.ly/3m1onBm", "GaN-MN_October_2022.csv"), + ("Sep_2022", "http://bit.ly/3SHQJN5", "GaN-MN_September_2022.csv"), + ("Aug_2022", "http://bit.ly/40hFpdP", "GaN-MN_August_2022.csv"), + ("Jul_2022", "https://bit.ly/3Z487OD", "GaN-MN_July_2022.csv"), + ("Jun_2022", "https://bit.ly/3H5CsWS", "GaN-MN_June_2022.csv"), + ("May_2022", "https://bit.ly/3fnFCJT", "GaN-MN_May_2022.csv"), + ("Apr_2022", "https://bit.ly/3E8flJr", "GaN-MN_April_2022.csv"), + ("Mar_2022", "https://bit.ly/3B76VAq", "GaN-MN_March_2022.csv"), + ("Feb_2022", "http://bit.ly/3BCkXe1", "GaN-MN_February_2022.csv"), + ("Jan_2022", "http://bit.ly/3IgfkU4", "GaN-MN_January_2022.csv"), +] + +# Annual file for 2021 (one representative year) +ANNUAL_FILES = [ + ("2021", "https://bit.ly/44vtt9T", "GaN-MN_Annual_2021.csv"), +] + +FIELDNAMES = [ + "prayer", "date_local", "time_local", "utc_offset", + "lat", "lng", "elevation_m", "source", "notes", +] + + +def resolve_and_download(label: str, url: str, dest: Path) -> bool: + """ + Follow bit.ly redirect and download the file to dest. + Returns True on success. + """ + if dest.exists(): + log.info("Already have %s — skipping download", dest.name) + return True + + log.info("Downloading %s from %s ...", label, url) + try: + # bit.ly does a redirect; requests follows by default + headers = { + "User-Agent": ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/124.0.0.0 Safari/537.36" + ) + } + with requests.get(url, headers=headers, stream=True, timeout=120) as resp: + resp.raise_for_status() + total = int(resp.headers.get("content-length", 0)) + downloaded = 0 + with open(dest, "wb") as fh: + for chunk in resp.iter_content(chunk_size=1 << 20): # 1 MB chunks + if chunk: + fh.write(chunk) + downloaded += len(chunk) + mb = downloaded / (1 << 20) + log.info("Downloaded %s: %.1f MB", dest.name, mb) + return True + except Exception as exc: + log.error("Failed to download %s: %s", label, exc) + if dest.exists(): + dest.unlink() + return False + + +def process_and_save(label: str, csv_path: Path) -> int: + """ + Run the GaN-MN processor on csv_path and write results to + data/raw/raw_sightings/gan_mn_{label_lower}.csv. + Returns count of records written. + """ + slug = label.lower().replace(" ", "_") + out_path = SIGHTINGS_DIR / f"gan_mn_{slug}.csv" + + if out_path.exists(): + # Count existing rows + with open(out_path) as fh: + existing = sum(1 for _ in fh) - 1 # subtract header + log.info("Already processed %s (%d rows) — skipping", out_path.name, existing) + return existing + + log.info("Processing %s ...", csv_path.name) + try: + records = process_gan_mn_csv(csv_path) + except Exception as exc: + log.error("Processing failed for %s: %s", csv_path.name, exc) + return 0 + + if not records: + log.warning("No records extracted from %s", csv_path.name) + return 0 + + with open(out_path, "w", newline="") as fh: + writer = csv.DictWriter(fh, fieldnames=FIELDNAMES) + writer.writeheader() + writer.writerows(records) + + log.info("Wrote %d records to %s", len(records), out_path.name) + return len(records) + + +def main() -> None: + all_files = MONTHLY_FILES + ANNUAL_FILES + total_downloaded = 0 + total_records = 0 + skipped_download = 0 + failed_download = [] + results = [] + + for label, url, filename in all_files: + dest = RAW_DIR / filename + ok = resolve_and_download(label, url, dest) + if not ok: + failed_download.append(label) + continue + if dest.stat().st_size < 1000: + log.error("%s appears too small (%d bytes), skipping processing", filename, dest.stat().st_size) + failed_download.append(label) + continue + + total_downloaded += 1 + count = process_and_save(label, dest) + total_records += count + results.append((label, count)) + + # Brief pause between files to avoid hammering the network + time.sleep(0.5) + + # Also count already-existing sightings from the 3 pre-downloaded months + pre_existing = ["august_2024", "january_2025", "june_2025"] + pre_total = 0 + for slug in pre_existing: + p = SIGHTINGS_DIR / f"gan_mn_{slug}.csv" + if p.exists(): + with open(p) as fh: + n = sum(1 for _ in fh) - 1 + pre_total += n + log.info("Pre-existing %s: %d rows", p.name, n) + + log.info("=" * 60) + log.info("Download summary:") + log.info(" Files newly processed: %d", total_downloaded) + log.info(" Records from new files: %d", total_records) + log.info(" Records from pre-existing 3 months: %d", pre_total) + log.info(" Grand total twilight events: %d", total_records + pre_total) + if failed_download: + log.warning(" Failed downloads: %s", ", ".join(failed_download)) + + log.info("Per-file breakdown:") + for label, count in results: + log.info(" %-15s %d", label, count) + + +if __name__ == "__main__": + main() diff --git a/src/collect/madrid_sqm_processor.py b/src/collect/madrid_sqm_processor.py new file mode 100644 index 0000000..6bb25d5 --- /dev/null +++ b/src/collect/madrid_sqm_processor.py @@ -0,0 +1,169 @@ +""" +Processor for the Madrid Zenodo SQM dataset (Record 4633001). + +This script reads data/raw/massive_sqm/SQM_evol.csv, representing 10 years of +continuous minute-by-minute Sky Quality Meter readings in Madrid, Spain. + +We apply the BRIN inflection-point detection algorithm: +- Fajr: Moment of steepest SQM decline in the pre-dawn window. +- Isha: Moment SQM reaches within 0.5 mag of the dark-night baseline. + +Output: data/raw/raw_sightings/madrid_sqm_10yr.csv +""" + +import math +from pathlib import Path +from datetime import datetime, timezone, timedelta +import pandas as pd +import ephem + +# Madrid UCM Observatory from paper metadata +LAT = 40.45 +LNG = -3.73 +ELEVATION_M = 640 +UTC_OFFSET = 1.0 # Winter UTC+1, though data is in UTC so we just use this for local labeling + +RAW_CSV = Path("data/raw/massive_sqm/SQM_evol.csv") +OUT_FILE = Path("data/raw/raw_sightings/madrid_sqm_10yr.csv") + +MIN_DARK_SKY_MPSAS = 18.0 +MAX_MOON_ALT = 5.0 + +def get_solar_altitude(utc_dt: datetime) -> float: + obs = ephem.Observer() + obs.lat = str(LAT) + obs.lon = str(LNG) + obs.elevation = ELEVATION_M + obs.pressure = 1013.25 + obs.temp = 15.0 + if utc_dt.tzinfo is not None: + utc_dt = utc_dt.replace(tzinfo=None) + obs.date = ephem.Date(utc_dt) + sun = ephem.Sun(obs) + return math.degrees(float(sun.alt)) + +def process_madrid_data(): + if not RAW_CSV.exists(): + print(f"File not found: {RAW_CSV}") + return + + print("Loading Madrid SQM dataset. This may take a minute...") + # Format: Time(UTC);Magnitude + df = pd.read_csv(RAW_CSV, sep=';', usecols=['Time(UTC)', 'Magnitude']) + df['utc_dt'] = pd.to_datetime(df['Time(UTC)']).dt.tz_localize(timezone.utc) + df = df.rename(columns={'Magnitude': 'mpsas'}) + + # Filter bad data + df = df[df['mpsas'] > 0] + df = df.sort_values('utc_dt').reset_index(drop=True) + + print(f"Loaded {len(df)} rows. Processing nights...") + + # Identify unique local calendar dates in the dataset + df['local_date'] = (df['utc_dt'] + timedelta(hours=UTC_OFFSET)).dt.date + unique_dates = df['local_date'].unique() + + records = [] + + obs = ephem.Observer() + obs.lat = str(LAT) + obs.lon = str(LNG) + obs.elevation = ELEVATION_M + obs.pressure = 1013.25 + obs.temp = 15.0 + + print(f"Found {len(unique_dates)} nights to process.") + for local_date in unique_dates: + # Midnight UTC for the given local date + noon_utc = datetime(local_date.year, local_date.month, local_date.day, 12, 0, tzinfo=timezone.utc) - timedelta(hours=UTC_OFFSET) + + obs.date = ephem.Date(noon_utc.replace(tzinfo=None)) + try: + sunset_utc = obs.next_setting(ephem.Sun()).datetime().replace(tzinfo=timezone.utc) + sunrise_utc = obs.next_rising(ephem.Sun()).datetime().replace(tzinfo=timezone.utc) + except ephem.AlwaysUpError: + continue + except ephem.NeverUpError: + continue + + # FAJR + window_start = sunrise_utc - timedelta(hours=5) + window_end = sunrise_utc - timedelta(minutes=10) + predawn = df[(df['utc_dt'] >= window_start) & (df['utc_dt'] <= window_end)].copy() + + if len(predawn) >= 30: + predawn['sun_alt'] = [get_solar_altitude(dt) for dt in predawn['utc_dt']] + predawn = predawn[predawn['sun_alt'] < -5.0] + + early = predawn[predawn['sun_alt'] < -18.0] + if early.empty: + early = predawn[predawn['sun_alt'] < -16.0] + if not early.empty and early['mpsas'].max() >= MIN_DARK_SKY_MPSAS: + predawn['mpsas_smooth'] = predawn['mpsas'].rolling(window=5, center=True, min_periods=3).mean() + predawn['dmpsas'] = predawn['mpsas_smooth'].diff() + + active = predawn[(predawn['sun_alt'] >= -25.0) & (predawn['sun_alt'] <= -5.0)] + if len(active) >= 10: + steepest_idx = active['dmpsas'].idxmin() + if not pd.isna(steepest_idx): + dawn_row = predawn.loc[steepest_idx] + depression_angle = -dawn_row['sun_alt'] + + if 10.0 <= depression_angle <= 22.0: + local_time = dawn_row['utc_dt'] + timedelta(hours=UTC_OFFSET) + records.append({ + "prayer": "fajr", + "date_local": local_time.strftime("%Y-%m-%d"), + "time_local": local_time.strftime("%H:%M"), + "utc_offset": UTC_OFFSET, + "lat": LAT, + "lng": LNG, + "elevation_m": ELEVATION_M, + "source": "Madrid Zenodo SQM (10-year)", + "notes": f"SQM inflection point Fajr detection; Madrid UCM Observatory; dark-sky baseline>={MIN_DARK_SKY_MPSAS}" + }) + + # ISHA + window_end_isha = sunset_utc + timedelta(hours=5) + evening = df[(df['utc_dt'] >= sunset_utc) & (df['utc_dt'] <= window_end_isha)].copy() + + if len(evening) >= 30: + evening['sun_alt'] = [get_solar_altitude(dt) for dt in evening['utc_dt']] + evening = evening[evening['sun_alt'] < 0] + + deep_night = evening[(evening['utc_dt'] >= sunset_utc + timedelta(hours=2)) & + (evening['utc_dt'] <= sunset_utc + timedelta(hours=6)) & + (evening['sun_alt'] < -18.0)] + + if len(deep_night) >= 10: + dark_baseline = deep_night['mpsas'].median() + if dark_baseline >= MIN_DARK_SKY_MPSAS: + evening['mpsas_smooth'] = evening['mpsas'].rolling(window=5, center=True, min_periods=3).mean() + threshold = dark_baseline - 0.5 + + reached = evening[(evening['mpsas_smooth'] >= threshold) & (evening['sun_alt'] < -12.0)] + if not reached.empty: + isha_row = reached.iloc[0] + depression_angle = -isha_row['sun_alt'] + + if 12.0 <= depression_angle <= 22.0: + local_time = isha_row['utc_dt'] + timedelta(hours=UTC_OFFSET) + records.append({ + "prayer": "isha", + "date_local": local_time.strftime("%Y-%m-%d"), + "time_local": local_time.strftime("%H:%M"), + "utc_offset": UTC_OFFSET, + "lat": LAT, + "lng": LNG, + "elevation_m": ELEVATION_M, + "source": "Madrid Zenodo SQM (10-year)", + "notes": f"SQM threshold Isha detection (Shafaq Abyad); dark-sky baseline={dark_baseline:.2f}" + }) + + out_df = pd.DataFrame(records) + OUT_FILE.parent.mkdir(parents=True, exist_ok=True) + out_df.to_csv(OUT_FILE, index=False) + print(f"Successfully processed {len(records)} Fajr/Isha observations to {OUT_FILE}.") + +if __name__ == "__main__": + process_madrid_data() diff --git a/src/surfrad_process.py b/src/surfrad_process.py new file mode 100644 index 0000000..6b132e5 --- /dev/null +++ b/src/surfrad_process.py @@ -0,0 +1,282 @@ +""" +SURFRAD Downloader and Twilight Extractor — standalone. + +Stations: +- dra: Desert Rock, Nevada (36.626°N, 116.018°W) — arid, excellent visibility +- tbl: Table Mountain, Colorado (40.125°N, 105.237°W) — high altitude (1689m) +- gwn: Goodwin Creek, Mississippi (34.255°N, 89.873°W) — humid subtropical + +File naming: {code}{YY}{DDD}.dat + YY = 2-digit year + DDD = day of year (001-366) + +Column layout (SURFRAD standard format, v1): + 0: year + 1: jday (day of year) + 2: month + 3: day + 4: hour (UTC) + 5: min + 6: dt (time difference in minutes from standard time) + 7: zen (solar zenith angle, degrees) + 8: dw_solar (downwelling shortwave, W/m²) ← GHI + 9: dw_solar_sd + 10: direct (direct normal irradiance, W/m²) + 11: direct_sd + 12: diffuse (diffuse horizontal, W/m²) ← DIF + 13: diffuse_sd + 14+: other measurements + +Missing value: -9999.9 +""" + +import re +import math +import requests +import pandas as pd +import numpy as np +from datetime import datetime, timezone, timedelta, date +from pathlib import Path + +BASE = Path("/Volumes/X9/Sites/acamarata/pray-calc-ml") +RAW_BSRN = BASE / "data/raw/bsrn" +RAW_SIGHTINGS = BASE / "data/raw/raw_sightings" + +THRESHOLDS = [0.5, 1.0, 2.0, 5.0, 10.0] +SURFRAD_BASE = "https://gml.noaa.gov/aftp/data/radiation/surfrad" + +STATIONS = { + "dra": {"name": "Desert Rock", "lat": 36.626, "lon": -116.018, "dir": "dra"}, + "tbl": {"name": "TableMountain", "lat": 40.125, "lon": -105.237, "dir": "tbl"}, + "gwn": {"name": "GoodwinCreek", "lat": 34.2547, "lon": -89.8729, "dir": "gwn"}, +} + + +def download_file(url: str, dest: Path) -> bool: + if dest.exists() and dest.stat().st_size > 500: + return True + try: + r = requests.get(url, timeout=60) + r.raise_for_status() + dest.write_bytes(r.content) + return True + except Exception: + return False + + +def parse_surfrad_dat(content: bytes) -> pd.DataFrame: + """ + Parse SURFRAD .dat file. + Returns df with [times_utc, sza, ghi, dif]. + """ + lines = content.decode("utf-8", errors="replace").splitlines() + + # First 2 lines are headers; data starts line 2 + records = [] + for line in lines[2:]: + parts = line.split() + if len(parts) < 13: + continue + try: + yr = int(parts[0]) + jday = int(parts[1]) + hr = int(parts[4]) + mn = int(parts[5]) + zen = float(parts[7]) + ghi = float(parts[8]) + dif = float(parts[12]) + + dt = datetime(yr, 1, 1, hr, mn, tzinfo=timezone.utc) + timedelta(days=jday - 1) + records.append({ + "times_utc": dt, + "sza": None if zen < -900 else zen, + "ghi": None if ghi < -900 else max(0.0, ghi), + "dif": None if dif < -900 else max(0.0, dif), + }) + except (ValueError, IndexError): + continue + + return pd.DataFrame(records) + + +def extract_events(df: pd.DataFrame, lat: float, lon: float, + station_code: str, station_name: str, + source: str) -> list[dict]: + """Same logic as bsrn_process.py extract_events_from_df.""" + events = [] + df = df.copy() + df["times_utc"] = pd.to_datetime(df["times_utc"], utc=True, errors="coerce") + df = df.dropna(subset=["times_utc"]).sort_values("times_utc").reset_index(drop=True) + + # Use diffuse as primary signal, fall back to GHI + df["irr"] = pd.to_numeric(df.get("dif", pd.Series()), errors="coerce") + if "ghi" in df.columns: + df["irr"] = df["irr"].combine_first(pd.to_numeric(df["ghi"], errors="coerce")) + + df["sza"] = pd.to_numeric(df["sza"], errors="coerce") + df["date"] = df["times_utc"].dt.date + + for day, grp in df.groupby("date"): + grp = grp.sort_values("times_utc").reset_index(drop=True) + irr = grp["irr"].values + sza = grp["sza"].values + ts = grp["times_utc"].values + n = len(irr) + if n < 10: + continue + + for threshold in THRESHOLDS: + dawn_idx = None + for i in range(2, n): + if (not np.isnan(irr[i]) and irr[i] >= threshold + and not np.isnan(sza[i]) and sza[i] > 90.0): + prev_ok = all( + (np.isnan(v) or v < threshold) + for v in irr[max(0, i-3):i] + ) + if prev_ok: + dawn_idx = i + break + + dusk_idx = None + for i in range(n - 3, -1, -1): + if (not np.isnan(irr[i]) and irr[i] >= threshold + and not np.isnan(sza[i]) and sza[i] > 90.0): + after_ok = all( + (np.isnan(v) or v < threshold) + for v in irr[i+1:min(n, i+4)] + ) + if after_ok: + dusk_idx = i + break + + for idx, event_type in [(dawn_idx, "dawn"), (dusk_idx, "dusk")]: + if idx is None: + continue + sza_val = float(sza[idx]) + if sza_val <= 90.0: + continue + dep = round(sza_val - 90.0, 4) + if dep < 0.3 or dep > 20.0: + continue + utc_ts = pd.Timestamp(ts[idx]) + irr_val = float(irr[idx]) + ghi_at = float(grp["ghi"].iloc[idx]) if "ghi" in grp.columns else float("nan") + dif_at = float(grp["dif"].iloc[idx]) if "dif" in grp.columns else float("nan") + + events.append({ + "date": str(day), + "event_type": event_type, + "utc_time": utc_ts.strftime("%H:%M:%S"), + "sza_deg": round(sza_val, 4), + "solar_depression_deg": dep, + "irr_wm2": round(irr_val, 2), + "ghi_wm2": round(ghi_at, 2) if not math.isnan(ghi_at) else None, + "dif_wm2": round(dif_at, 2) if not math.isnan(dif_at) else None, + "threshold_wm2": threshold, + "lat": lat, + "lon": lon, + "station_code": station_code, + "station_name": station_name, + "source": source, + }) + + return events + + +def process_station(code: str, meta: dict, years: list[int]) -> list[dict]: + print(f"\n {meta['name']} ({code}) lat={meta['lat']} lon={meta['lon']}") + dir_code = meta["dir"] + dest_root = RAW_BSRN / "surfrad" / dir_code + dest_root.mkdir(parents=True, exist_ok=True) + + all_events = [] + for year in years: + yy = str(year)[2:] # 2-digit year + # Build list of days + import calendar + days_in_year = 366 if calendar.isleap(year) else 365 + + dest_dir = dest_root / str(year) + dest_dir.mkdir(parents=True, exist_ok=True) + + day_dfs = [] + success_count = 0 + for doy in range(1, days_in_year + 1): + fname = f"{dir_code}{yy}{doy:03d}.dat" + dest = dest_dir / fname + url = f"{SURFRAD_BASE}/{dir_code}/{year}/{fname}" + + ok = download_file(url, dest) + if not ok: + continue + with open(dest, "rb") as f: + day_df = parse_surfrad_dat(f.read()) + if not day_df.empty: + day_dfs.append(day_df) + success_count += 1 + + if not day_dfs: + print(f" {year}: 0 days downloaded") + continue + + df_year = pd.concat(day_dfs, ignore_index=True) + evts = extract_events(df_year, meta["lat"], meta["lon"], + code, meta["name"], f"surfrad_{year}") + print(f" {year}: {success_count} days → {len(evts)} twilight events " + f"({len(df_year):,} 1-min records)") + all_events.extend(evts) + + return all_events + + +def main(): + print("SURFRAD Twilight Extractor") + years = [2018, 2019, 2020] + print(f"Years: {years}") + print(f"Stations: {list(STATIONS.keys())}") + + all_events = [] + for code, meta in STATIONS.items(): + evts = process_station(code, meta, years) + all_events.extend(evts) + print(f" → {len(evts)} total events for {code}") + + # Save + out = RAW_SIGHTINGS / "surfrad_twilight.csv" + if all_events: + df = pd.DataFrame(all_events) + df = df.sort_values(["station_code", "date", "event_type", "threshold_wm2"]) + df.to_csv(out, index=False) + print(f"\nSaved {len(df):,} rows → {out}") + + prim = df[df["threshold_wm2"] == 1.0] + print(f"\nSUMMARY (1.0 W/m² threshold):") + print(f" Total: {len(prim):,} events") + for evt in ["dawn", "dusk"]: + sub = prim[prim["event_type"] == evt] + if sub.empty: + continue + d = sub["solar_depression_deg"] + print(f" {evt.upper()}: {len(sub):,} | " + f"mean={d.mean():.2f}° p10={d.quantile(0.1):.2f}° " + f"p90={d.quantile(0.9):.2f}°") + else: + print("No events extracted.") + + # Merge into bsrn_all_twilight.csv + all_file = RAW_SIGHTINGS / "bsrn_all_twilight.csv" + if all_file.exists() and all_events: + df_existing = pd.read_csv(all_file) + # Remove old surfrad rows if any + df_existing = df_existing[~df_existing["source"].str.startswith("surfrad", na=False)] + df_new = pd.concat([df_existing, pd.DataFrame(all_events)], ignore_index=True) + df_new = df_new.sort_values(["station_code", "date", "event_type", "threshold_wm2"]) + df_new.to_csv(all_file, index=False) + print(f"\nUpdated bsrn_all_twilight.csv → {len(df_new):,} total rows") + + print("\nDone.") + + +if __name__ == "__main__": + main() diff --git a/src/surfrad_process2.py b/src/surfrad_process2.py new file mode 100644 index 0000000..307258c --- /dev/null +++ b/src/surfrad_process2.py @@ -0,0 +1,204 @@ +""" +SURFRAD processor — reads already-downloaded files in DRA/ and TBL/ dirs. +Also downloads remaining missing days. +""" + +import math +import requests +import pandas as pd +import numpy as np +import calendar +from datetime import datetime, timezone, timedelta +from pathlib import Path + +BASE = Path("/Volumes/X9/Sites/acamarata/pray-calc-ml") +RAW_BSRN = BASE / "data/raw/bsrn" +RAW_SIGHTINGS = BASE / "data/raw/raw_sightings" +THRESHOLDS = [0.5, 1.0, 2.0, 5.0, 10.0] +SURFRAD_BASE = "https://gml.noaa.gov/aftp/data/radiation/surfrad" + +STATIONS = { + "dra": {"name": "Desert Rock", "lat": 36.626, "lon": -116.018, "dir_code": "DRA", "file_prefix": "dra"}, + "tbl": {"name": "TableMountain", "lat": 40.125, "lon": -105.237, "dir_code": "TBL", "file_prefix": "tbl"}, +} +YEARS = [2018, 2019, 2020] + + +def download_file(url: str, dest: Path) -> bool: + if dest.exists() and dest.stat().st_size > 200: + return True + try: + r = requests.get(url, timeout=30) + if r.status_code != 200: + return False + dest.write_bytes(r.content) + return True + except Exception: + return False + + +def parse_surfrad_dat(fpath: Path) -> pd.DataFrame: + lines = fpath.read_bytes().decode("utf-8", errors="replace").splitlines() + records = [] + for line in lines[2:]: + parts = line.split() + if len(parts) < 13: + continue + try: + yr = int(parts[0]) + jday = int(parts[1]) + hr = int(parts[4]) + mn = int(parts[5]) + zen = float(parts[7]) + ghi = float(parts[8]) + dif = float(parts[12]) + dt = datetime(yr, 1, 1, hr, mn, tzinfo=timezone.utc) + timedelta(days=jday - 1) + records.append({ + "times_utc": dt, + "sza": None if zen < -900 else zen, + "ghi": None if ghi < -900 else max(0.0, ghi), + "dif": None if dif < -900 else max(0.0, dif), + }) + except (ValueError, IndexError): + continue + return pd.DataFrame(records) + + +def extract_events(df, lat, lon, code, name, source): + events = [] + df = df.copy() + df["times_utc"] = pd.to_datetime(df["times_utc"], utc=True, errors="coerce") + df = df.dropna(subset=["times_utc"]).sort_values("times_utc").reset_index(drop=True) + df["irr"] = pd.to_numeric(df.get("dif", pd.Series(dtype=float)), errors="coerce") + if "ghi" in df.columns: + df["irr"] = df["irr"].combine_first(pd.to_numeric(df["ghi"], errors="coerce")) + df["sza"] = pd.to_numeric(df["sza"], errors="coerce") + df["date"] = df["times_utc"].dt.date + + for day, grp in df.groupby("date"): + grp = grp.sort_values("times_utc").reset_index(drop=True) + irr = grp["irr"].values + sza = grp["sza"].values + ts = grp["times_utc"].values + n = len(irr) + if n < 10: + continue + for threshold in THRESHOLDS: + dawn_idx = None + for i in range(2, n): + if (not np.isnan(irr[i]) and irr[i] >= threshold + and not np.isnan(sza[i]) and sza[i] > 90.0 + and all((np.isnan(v) or v < threshold) for v in irr[max(0,i-3):i])): + dawn_idx = i + break + dusk_idx = None + for i in range(n-3, -1, -1): + if (not np.isnan(irr[i]) and irr[i] >= threshold + and not np.isnan(sza[i]) and sza[i] > 90.0 + and all((np.isnan(v) or v < threshold) for v in irr[i+1:min(n,i+4)])): + dusk_idx = i + break + for idx, etype in [(dawn_idx, "dawn"), (dusk_idx, "dusk")]: + if idx is None: + continue + sza_val = float(sza[idx]) + if sza_val <= 90.0: + continue + dep = round(sza_val - 90.0, 4) + if dep < 0.3 or dep > 20.0: + continue + utc_ts = pd.Timestamp(ts[idx]) + ghi_at = float(grp["ghi"].iloc[idx]) if "ghi" in grp.columns else float("nan") + dif_at = float(grp["dif"].iloc[idx]) if "dif" in grp.columns else float("nan") + events.append({ + "date": str(day), "event_type": etype, + "utc_time": utc_ts.strftime("%H:%M:%S"), + "sza_deg": round(sza_val, 4), + "solar_depression_deg": dep, + "irr_wm2": round(float(irr[idx]), 2), + "ghi_wm2": round(ghi_at, 2) if not math.isnan(ghi_at) else None, + "dif_wm2": round(dif_at, 2) if not math.isnan(dif_at) else None, + "threshold_wm2": threshold, + "lat": lat, "lon": lon, + "station_code": code, "station_name": name, "source": source, + }) + return events + + +def main(): + print("SURFRAD Processor v2 — using cached downloads + filling gaps") + all_events = [] + + for code, meta in STATIONS.items(): + print(f"\n{meta['name']} ({code})") + dir_code = meta["dir_code"] + prefix = meta["file_prefix"] + dest_root = RAW_BSRN / "surfrad" / dir_code + + for year in YEARS: + yy = str(year)[2:] + days_in_year = 366 if calendar.isleap(year) else 365 + dest_dir = dest_root / str(year) + dest_dir.mkdir(parents=True, exist_ok=True) + + # Download any missing days + missing = 0 + for doy in range(1, days_in_year + 1): + fname = f"{prefix}{yy}{doy:03d}.dat" + dest = dest_dir / fname + if dest.exists() and dest.stat().st_size > 200: + continue + url = f"{SURFRAD_BASE}/{prefix}/{year}/{fname}" + if download_file(url, dest): + missing += 1 + + # Parse all available files + day_dfs = [] + for fpath in sorted(dest_dir.glob(f"{prefix}{yy}*.dat")): + day_df = parse_surfrad_dat(fpath) + if not day_df.empty: + day_dfs.append(day_df) + + if not day_dfs: + print(f" {year}: no data files found") + continue + + df_year = pd.concat(day_dfs, ignore_index=True) + evts = extract_events(df_year, meta["lat"], meta["lon"], + code, meta["name"], f"surfrad_{year}") + print(f" {year}: {len(day_dfs)} days, {len(df_year):,} records → {len(evts)} events (downloaded {missing} new files)") + all_events.extend(evts) + + out = RAW_SIGHTINGS / "surfrad_twilight.csv" + if all_events: + df = pd.DataFrame(all_events).sort_values( + ["station_code", "date", "event_type", "threshold_wm2"]) + df.to_csv(out, index=False) + print(f"\nSaved {len(df):,} rows → {out}") + + prim = df[df["threshold_wm2"] == 1.0] + print(f"\nSummary at 1 W/m²: {len(prim):,} events") + for etype in ["dawn", "dusk"]: + sub = prim[prim["event_type"] == etype] + if not sub.empty: + d = sub["solar_depression_deg"] + print(f" {etype}: {len(sub):,} | mean={d.mean():.2f}° " + f"p10={d.quantile(0.1):.2f}° p90={d.quantile(0.9):.2f}°") + + # Update combined file + combined = RAW_SIGHTINGS / "bsrn_all_twilight.csv" + if combined.exists(): + df_all = pd.read_csv(combined) + df_all = df_all[~df_all["source"].str.startswith("surfrad", na=False)] + df_all = pd.concat([df_all, pd.DataFrame(all_events)], ignore_index=True) + df_all = df_all.sort_values(["station_code", "date", "event_type", "threshold_wm2"]) + df_all.to_csv(combined, index=False) + print(f"Updated bsrn_all_twilight.csv → {len(df_all):,} total rows") + else: + print("No events.") + + print("\nDone.") + + +if __name__ == "__main__": + main()