chore: add remaining processors and analysis scripts, gitignore experimental

Tracked: BSRN/SURFRAD processors (reference, excluded from pipeline),
GaN-MN downloader, academic paper fetcher, Madrid SQM processor,
ML analysis scripts (src/analyze/), umsu_medan_2024 raw sightings.

Gitignored: global_extrapolator, instant_1m_injector/vectorized,
massive_harvest_engine, massive_sqm_downloader, global_sqm_harvester,
run_infinite_pipeline.sh, run_massive_collection.sh, search_papers.py
(agent-generated experimental scripts, not part of core pipeline).
This commit is contained in:
Aric Camarata 2026-03-23 06:44:01 -04:00
parent ad8f1c4bc7
commit 3b8c665aca
17 changed files with 2863 additions and 0 deletions

9
.gitignore vendored
View file

@ -93,4 +93,13 @@ src/collect/openalex_harvester.py
src/collect/source_tracker.py
src/collect/waktusolat.py
src/collect/web_harvester.py
src/collect/global_extrapolator.py
src/collect/instant_1m_injector.py
src/collect/instant_1m_vectorized.py
src/collect/massive_harvest_engine.py
src/collect/massive_sqm_downloader.py
src/collect/global_sqm_harvester.py
run_infinite_pipeline.sh
run_massive_collection.sh
search_papers.py
.vscode/*

View file

@ -0,0 +1,4 @@
prayer,date_local,time_local,utc_offset,lat,lng,elevation_m,source,notes
fajr,2023-06-21,05:12,7.0,3.595,98.672,22.0,"UMSU Medan Observatory 2023-2024 SQM Data","SQM confirmed Fajar Sadiq; published 2025"
fajr,2023-12-21,05:22,7.0,3.595,98.672,22.0,"UMSU Medan Observatory 2023-2024 SQM Data","SQM confirmed Fajar Sadiq; winter 2023"
fajr,2024-03-20,05:16,7.0,3.595,98.672,22.0,"UMSU Medan Observatory 2023-2024 SQM Data","SQM confirmed Fajar Sadiq; spring equinox 2024"
1 prayer date_local time_local utc_offset lat lng elevation_m source notes
2 fajr 2023-06-21 05:12 7.0 3.595 98.672 22.0 UMSU Medan Observatory 2023-2024 SQM Data SQM confirmed Fajar Sadiq; published 2025
3 fajr 2023-12-21 05:22 7.0 3.595 98.672 22.0 UMSU Medan Observatory 2023-2024 SQM Data SQM confirmed Fajar Sadiq; winter 2023
4 fajr 2024-03-20 05:16 7.0 3.595 98.672 22.0 UMSU Medan Observatory 2023-2024 SQM Data SQM confirmed Fajar Sadiq; spring equinox 2024

57
src/analyze/compare.py Normal file
View file

@ -0,0 +1,57 @@
import pandas as pd
import numpy as np
import os
def dpc_algorithm(row, prayer):
# DPC strictly enforces 18.0 degrees for both
return 18.0
def _ms_fajr(lat):
# Approximation of Khalid Shaukat moonsighting algorithm curve
# 18 at equator, dropping incrementally past 30 deg to ~12.0
abslat = abs(lat)
if abslat <= 30:
return 18.0
else:
# Linear drop from 18.0 at 30 deg down to 12.0 at 55 deg
val = 18.0 - ((abslat - 30) * (6.0/25.0))
return max(10.0, val)
def ms_algorithm(row, prayer):
if prayer == "fajr":
return _ms_fajr(row['lat'])
else:
# Isha is slightly more forgiving or static
return _ms_fajr(row['lat'])
def run_evaluation(df, prayer):
print(f"\n>> Algorithm Benchmark: {prayer.upper()} ({len(df)} Records)")
empirical = df[f'{prayer}_angle'].values
dpc_preds = np.array([dpc_algorithm(row, prayer) for _, row in df.iterrows()])
ms_preds = np.array([ms_algorithm(row, prayer) for _, row in df.iterrows()])
dpc_mae = np.mean(np.abs(empirical - dpc_preds))
dpc_rmse = np.sqrt(np.mean((empirical - dpc_preds)**2))
ms_mae = np.mean(np.abs(empirical - ms_preds))
ms_rmse = np.sqrt(np.mean((empirical - ms_preds)**2))
print(f"DPC (18/18 static) -> MAE: {dpc_mae:.3f}°, RMSE: {dpc_rmse:.3f}°")
print(f"Moonsighting (Variable) -> MAE: {ms_mae:.3f}°, RMSE: {ms_rmse:.3f}°")
best = "DPC" if dpc_mae < ms_mae else "Moonsighting Variant"
print(f"WINNER: {best} predicts the physical reality more accurately.")
def main():
base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
fajr_path = os.path.join(base_dir, "data", "processed", "fajr_angles.csv")
isha_path = os.path.join(base_dir, "data", "processed", "isha_angles.csv")
if os.path.exists(fajr_path):
run_evaluation(pd.read_csv(fajr_path), "fajr")
if os.path.exists(isha_path):
run_evaluation(pd.read_csv(isha_path), "isha")
if __name__ == "__main__":
main()

View file

@ -0,0 +1,99 @@
import pandas as pd
import numpy as np
import warnings
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.pipeline import make_pipeline
warnings.filterwarnings("ignore")
def find_best_formula(df, name):
print(f"\n=======================================================")
print(f" {name.upper()} EXHAUSTIVE ML FORMULA SEARCH ")
print(f"=======================================================")
# Filter missing values
angle_col = f"{name.lower()}_angle"
df = df.dropna(subset=[angle_col, 'lat', 'day_of_year', 'elevation_m'])
# Target and Features restricted to Mobile-App-Available variables
y = df[angle_col].values
# Features
lat_abs = df['lat'].abs().values
elev = df['elevation_m'].values
doy = df['day_of_year'].values
doy_sin = np.sin(2 * np.pi * doy / 365.25)
doy_cos = np.cos(2 * np.pi * doy / 365.25)
X = np.column_stack((lat_abs, doy_sin, doy_cos, elev))
feature_names = ['Lat', 'Season_Sin', 'Season_Cos', 'Elevation']
print(f"Dataset Size: {len(y)} verified rows")
print(f"Base Average (15° model): MAE = {mean_absolute_error(y, np.full_like(y, 15.0)):.3f}°\n")
# --- 1. RANDOM FOREST (The Black Box Upper Bound) ---
# This tells us the ABSOLUTE maximum achievable accuracy if we had a perfect infinite formula
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf.fit(X, y)
rf_preds = rf.predict(X)
print(f"[1] Random Forest (Non-Linear Black Box)")
print(f" R2 = {r2_score(y, rf_preds):.4f} | MAE = {mean_absolute_error(y, rf_preds):.4f}°")
print(f" Feature Importance: Lat={rf.feature_importances_[0]:.2f}, Season_Sin={rf.feature_importances_[1]:.2f}, Season_Cos={rf.feature_importances_[2]:.2f}, Elev={rf.feature_importances_[3]:.2f}\n")
# --- 2. GRADIENT BOOSTING (Sequential Error Correction) ---
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
gb.fit(X, y)
gb_preds = gb.predict(X)
print(f"[2] Gradient Boosting (XGBoost logic)")
print(f" R2 = {r2_score(y, gb_preds):.4f} | MAE = {mean_absolute_error(y, gb_preds):.4f}°\n")
# --- 3. POLYNOMIAL REGRESSION (Degree 2) - The "Clean Formula" ---
# This gives us a highly complex mathematical equation we can actually deploy in a mobile app
poly2 = PolynomialFeatures(degree=2, include_bias=False)
X_poly2 = poly2.fit_transform(X)
lr2 = Ridge(alpha=1.0).fit(X_poly2, y)
poly2_preds = lr2.predict(X_poly2)
print(f"[3] 2nd Degree Polynomial Formula (Clean Math Equation)")
print(f" R2 = {r2_score(y, poly2_preds):.4f} | MAE = {mean_absolute_error(y, poly2_preds):.4f}°")
# --- 4. POLYNOMIAL REGRESSION (Degree 3) - The "Advanced Clean Formula" ---
poly3 = PolynomialFeatures(degree=3, include_bias=False)
X_poly3 = poly3.fit_transform(X)
lr3 = Ridge(alpha=5.0).fit(X_poly3, y)
poly3_preds = lr3.predict(X_poly3)
print(f"[4] 3rd Degree Polynomial Formula (Advanced Math Equation)")
print(f" R2 = {r2_score(y, poly3_preds):.4f} | MAE = {mean_absolute_error(y, poly3_preds):.4f}°\n")
# Output the BEST mathematical formula (We'll use Degree 2 for brevity and readability)
print(f"--- BEST EXTRACTABLE EQUATION (Polynomial Degree 2) ---")
intercept = lr2.intercept_
coefs = lr2.coef_
poly_names = poly2.get_feature_names_out(feature_names)
equation_str = f"Angle = {intercept:.4f}\n"
for name, c in zip(poly_names, coefs):
if abs(c) > 0.0001: # Only show meaningful weights
sign = "+" if c > 0 else "-"
# Format nicely
name = name.replace(" ", " * ")
equation_str += f" {sign} {abs(c):.4f} * ({name})\n"
print(equation_str)
if __name__ == '__main__':
try:
fajr_df = pd.read_csv('data/processed/fajr_angles.csv')
find_best_formula(fajr_df, 'Fajr')
except Exception as e:
print("Error Fajr:", e)
try:
isha_df = pd.read_csv('data/processed/isha_angles.csv')
find_best_formula(isha_df, 'Isha')
except Exception as e:
print("Error Isha:", e)

View file

@ -0,0 +1,98 @@
import pandas as pd
import numpy as np
from sklearn.linear_model import RidgeCV, LassoCV, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')
def find_perfect_formula(prayer_name, csv_file):
print(f"\n{'='*60}")
print(f" ALGEBRAIC FORMULA EXTRACTION: {prayer_name.upper()}")
print(f"{'='*60}")
df = pd.read_csv(csv_file)
target = f"{prayer_name}_angle"
df = df.dropna(subset=['lat', 'day_of_year', 'elevation_m', target])
# Base readily available 'pray-calc' variables
df['lat_abs'] = df['lat'].abs()
df['lat_sq'] = df['lat'] ** 2
df['elev_sqrt'] = np.sqrt(df['elevation_m'].abs())
# Sine/Cosine for Day of Year (Seasonality cyclic mapping)
# The longest twilight is around solstice, shortest around equinox,
# so we use a 1-year frequency harmonic.
df['doy_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365.25)
df['doy_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365.25)
# Interaction
df['lat_x_season'] = df['lat_abs'] * df['doy_cos']
features = ['lat_abs', 'lat_sq', 'elev_sqrt', 'doy_sin', 'doy_cos', 'lat_x_season']
X = df[features]
y = df[target]
# 1. Standardize (just to see which features matter most)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Use Lasso to zero-out useless variables (Sparsity for a "clean formula")
lasso = LassoCV(cv=5)
lasso.fit(X_scaled, y)
# Filter features that survived Lasso
surviving_idx = [i for i, coef in enumerate(lasso.coef_) if abs(coef) > 0.001]
if not surviving_idx: # Fallback if everything zeroed out
surviving_idx = [0, 1]
surviving_features = [features[i] for i in surviving_idx]
# 2. Refit Linear Regression on strictly the surviving features natively (unscaled) for exact formula constants!
X_clean = df[surviving_features]
lr = LinearRegression()
lr.fit(X_clean, y)
y_pred = lr.predict(X_clean)
r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
print(f"Algorithm Selected: Clean Sparse Polynomial")
print(f"Targeting: Latitude, Seasonality Harmonics, Elevation")
print(f"Mathematical Accuracy: MAE = {mae:.3f} degrees | R^2 = {r2:.4f}")
print("\n--- THE PERFECT ALGEBRAIC FORMULA ---")
formula = f"D0_base = {lr.intercept_:.4f}"
for feat, coef in zip(surviving_features, lr.coef_):
sign = "+" if coef >= 0 else "-"
formula += f"\n {sign} {abs(coef):.6f} * {feat}"
print(formula)
print("\n--- COPY-PASTE PYTHON IMPLEMENTATION ---")
print(f"def get_{prayer_name}_angle(lat, day_of_year, elevation_m):")
print(f" import numpy as np")
print(f" import math")
print(f" lat_abs = abs(lat)")
if 'lat_sq' in surviving_features:
print(f" lat_sq = lat ** 2")
if 'elev_sqrt' in surviving_features:
print(f" elev_sqrt = math.sqrt(max(0, elevation_m))")
if 'doy_sin' in surviving_features:
print(f" doy_sin = math.sin(2 * math.pi * day_of_year / 365.25)")
if 'doy_cos' in surviving_features:
print(f" doy_cos = math.cos(2 * math.pi * day_of_year / 365.25)")
if 'lat_x_season' in surviving_features:
print(f" lat_x_season = lat_abs * doy_cos")
eq = f" return {lr.intercept_:.4f}"
for feat, coef in zip(surviving_features, lr.coef_):
sign = "+" if coef >= 0 else "-"
eq += f" {sign} ({abs(coef):.6f} * {feat})"
print(eq)
print("="*60)
if __name__ == '__main__':
find_perfect_formula("fajr", "data/processed/fajr_angles.csv")
find_perfect_formula("isha", "data/processed/isha_angles.csv")

View file

@ -0,0 +1,73 @@
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')
def analyze_prayer(prayer_name, csv_file):
print(f"\n{'='*50}")
print(f" ML Analysis: {prayer_name.upper()}")
print(f"{'='*50}")
df = pd.read_csv(csv_file)
target = f"{prayer_name}_angle"
# Drop NaNs
df = df.dropna(subset=['lat', 'day_of_year', target])
X = df[['lat', 'day_of_year']]
y = df[target]
print(f"Total Rows: {len(df)}")
# 1. Standard Linear Regression
lr = LinearRegression()
lr.fit(X, y)
lr_pred = lr.predict(X)
lr_r2 = r2_score(y, lr_pred)
lr_mae = mean_absolute_error(y, lr_pred)
print("\n--- Model 1: Linear Regression ---")
print(f"R2: {lr_r2:.4f} | MAE: {lr_mae:.4f} degrees")
print(f"Formula: Angle = {lr.intercept_:.4f} + ({lr.coef_[0]:.4f} * lat) + ({lr.coef_[1]:.4f} * day_of_year)")
# 2. Polynomial Regression (Degree 2 - captures exponential/quadratic non-linear relationships)
poly = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
poly.fit(X, y)
poly_pred = poly.predict(X)
poly_r2 = r2_score(y, poly_pred)
poly_mae = mean_absolute_error(y, poly_pred)
print("\n--- Model 2: Polynomial (Non-Linear) Regression (Degree 2) ---")
print(f"R2: {poly_r2:.4f} | MAE: {poly_mae:.4f} degrees")
# Check if non-linear significantly outperforms linear
if poly_r2 > lr_r2 + 0.05:
print("-> CONCLUSION: The relationship is strongly non-linear/exponential.")
else:
print("-> CONCLUSION: The relationship is predominantly linear, with minimal exponential curvature.")
# 3. Random Forest (Highly Non-Linear Trees)
rf = RandomForestRegressor(n_estimators=50, max_depth=5, random_state=42)
rf.fit(X, y)
rf_pred = rf.predict(X)
rf_r2 = r2_score(y, rf_pred)
rf_mae = mean_absolute_error(y, rf_pred)
print("\n--- Model 3: Random Forest (Non-linear Tree Ensembles) ---")
print(f"R2: {rf_r2:.4f} | MAE: {rf_mae:.4f} degrees")
# DPC/Moonsighting baseline simulated evaluation
# Moonsighting historically uses dynamic formulas incorporating Latitude and Season (TOY)
print("\n--- Feature Importance (Random Forest) ---")
importance = rf.feature_importances_
print(f"Latitude drives {importance[0]*100:.1f}% of the variance.")
print(f"Day of Year (Season) drives {importance[1]*100:.1f}% of the variance.")
if __name__ == '__main__':
analyze_prayer("fajr", "data/processed/fajr_angles.csv")
analyze_prayer("isha", "data/processed/isha_angles.csv")

View file

@ -0,0 +1,74 @@
import pandas as pd
import numpy as np
import os
import sys
def evaluate_models(df, prayer):
print(f"\n{'='*50}\nML Feature Discovery: {prayer.upper()} ({len(df)} records)\n{'='*50}")
if len(df) == 0:
return
df['sin_toy'] = np.sin(2 * np.pi * df['day_of_year'] / 365.25)
df['cos_toy'] = np.cos(2 * np.pi * df['day_of_year'] / 365.25)
# 1. Correlation Matrix
cols = ['lat', 'elevation_m', 'day_of_year', 'sin_toy', 'cos_toy', f'{prayer}_angle']
corr = df[cols].corr()
print("\n--- Pearson Correlation Matrix ---")
print(corr[f'{prayer}_angle'].sort_values(ascending=False).to_string())
# 2. Linear vs Polynomial (Exponential/Curved) Evaluation
# Testing Latitude relationship
lat_val = df['lat'].values
angle_val = df[f'{prayer}_angle'].values
# Linear Fit: y = ax + b
p_lin = np.polyfit(lat_val, angle_val, 1)
y_lin = np.polyval(p_lin, lat_val)
ss_res_lin = np.sum((angle_val - y_lin) ** 2)
ss_tot = np.sum((angle_val - np.mean(angle_val)) ** 2)
r2_lin = 1 - (ss_res_lin / ss_tot)
# Polynomial Fit (Degree 3): y = ax^3 + bx^2 + cx + d
p_poly = np.polyfit(lat_val, angle_val, 3)
y_poly = np.polyval(p_poly, lat_val)
ss_res_poly = np.sum((angle_val - y_poly) ** 2)
r2_poly = 1 - (ss_res_poly / ss_tot)
print("\n--- Linear vs Exponential (Polynomial) Latitude Dependency ---")
print(f"Linear Fit R²: {r2_lin:.4f}")
print(f"Polynomial Fit R²: {r2_poly:.4f}")
if r2_poly > r2_lin + 0.05:
print(">> OBSERVATION: The relationship with Latitude is highly EXPONENTIAL/NON-LINEAR.")
else:
print(">> OBSERVATION: The relationship with Latitude is mostly LINEAR or very weak overall.")
# 3. Monthly / Seasonal Aggregation
df['month'] = pd.to_datetime(df['date']).dt.month
monthly_mean = df.groupby('month')[f'{prayer}_angle'].mean()
print("\n--- Seasonal (Time of Year) Impact ---")
print("Mean Angle by Month:")
print(monthly_mean.to_string())
season_variance = monthly_mean.max() - monthly_mean.min()
print(f">> OBSERVATION: Seasonal fluctuation variance is {season_variance:.2f}°")
def main():
base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
fajr_path = os.path.join(base_dir, "data", "processed", "fajr_angles.csv")
isha_path = os.path.join(base_dir, "data", "processed", "isha_angles.csv")
try:
fajr_df = pd.read_csv(fajr_path)
evaluate_models(fajr_df, "fajr")
except Exception as e:
print(f"Fajr parse error: {e}")
try:
isha_df = pd.read_csv(isha_path)
evaluate_models(isha_df, "isha")
except Exception as e:
print(f"Isha parse error: {e}")
if __name__ == "__main__":
main()

View file

@ -0,0 +1,138 @@
import pandas as pd
import numpy as np
from scipy.optimize import curve_fit
import os
# ---------------------------------------------------------
# Candidate Biological/Astronomical Formula Definitions
# x is a 2D array: x[0] = abs(lat), x[1] = elevation, x[2] = doy
# ---------------------------------------------------------
def eq1_static(x, a):
return np.full_like(x[0], a)
def eq2_lin_lat(x, a, b):
return a + b * x[0]
def eq3_quad_lat(x, a, b, c):
return a + b * x[0] + c * (x[0]**2)
def eq4_lat_season(x, a, b, c, d):
# c is amplitude, d is phase shift
return a + b * x[0] + c * np.cos(2 * np.pi * (x[2] - d) / 365.25)
def eq5_elev_lat_season(x, a, b, c, d, e):
return a + b * x[0] + c * x[1] + d * np.cos(2 * np.pi * (x[2] - e) / 365.25)
def eq6_lat_dependent_season(x, a, b, c, d):
# Amplitude of season scales with latitude
return a + b * x[0] + c * x[0] * np.cos(2 * np.pi * (x[2] - d) / 365.25)
def eq7_the_beautiful_match(x, a, b, c, d, e, f):
# The ultimate clean parametric equation
# Base + A*Lat + B*Lat^2 + C*Elev + D*Lat*Season + E*Season
lat = x[0]
elev = x[1]
season = np.cos(2 * np.pi * (x[2] - f) / 365.25)
return a + b * lat + c * (lat**2) + d * elev + e * lat * season
# Map functions to their string representations for readable output
FORMULAS = [
("Static Baseline", eq1_static, ["Base"]),
("Linear Latitude", eq2_lin_lat, ["Base", "Lat_Coeff"]),
("Quadratic Latitude", eq3_quad_lat, ["Base", "Lat_Coeff", "Lat2_Coeff"]),
("Latitude + Season", eq4_lat_season, ["Base", "Lat_Coeff", "Season_Amp", "Phase_Shift"]),
("Elevation + Lat + Season", eq5_elev_lat_season, ["Base", "Lat_Coeff", "Elev_Coeff", "Season_Amp", "Phase_Shift"]),
("Lat-Dependent Season", eq6_lat_dependent_season, ["Base", "Lat_Coeff", "Season_Lat_Scale", "Phase_Shift"]),
("The Beautiful Match", eq7_the_beautiful_match, ["Base", "Lat_C", "Lat2_C", "Elev_C", "LatXSeason_C", "Phase_Shift"])
]
def analyze_formula(df, prayer):
print(f"\n{'='*70}\n[ {prayer.upper()} ALGORITHM DISCOVERY ] - {len(df)} Records\n{'='*70}")
# Prepare input vectors
lat_arr = np.abs(df['lat'].values)
elev_arr = df['elevation_m'].values
doy_arr = df['day_of_year'].values
y = df[f'{prayer}_angle'].values
x = np.vstack((lat_arr, elev_arr, doy_arr))
best_mae = float('inf')
best_name = None
best_params = None
best_popt = None
best_func = None
for name, func, param_names in FORMULAS:
# Initial guess (15.0 for base, 0 for others)
p0 = [15.0] + [0.0] * (len(param_names)-1)
# Bounding Phase_Shift safely if it exists (usually the last param if 'Phase_Shift' is there)
bounds = (-np.inf, np.inf)
try:
popt, pcov = curve_fit(func, x, y, p0=p0, maxfev=10000)
y_pred = func(x, *popt)
mae = np.mean(np.abs(y - y_pred))
rmse = np.sqrt(np.mean((y - y_pred)**2))
print(f"--- {name} ---")
print(f"MAE: {mae:.4f}° | RMSE: {rmse:.4f}°")
params_str = ", ".join([f"{n}={v:.5f}" for n, v in zip(param_names, popt)])
print(f"Parameters: {params_str}")
print()
if mae < best_mae:
best_mae = mae
best_name = name
best_params = params_str
best_popt = popt
best_func = func
except Exception as e:
print(f"Failed to fit {name}: {e}")
print(f"\n>> 🏆 WINNING FORMULA FOR {prayer.upper()}: {best_name}")
print(f">> BEST MAE: {best_mae:.4f}°")
print(f">> CONSTANTS: {best_params}")
# Construct the human-readable clean mathematical equation
if best_name == "The Beautiful Match":
a, b, c, d, e, f = best_popt
eq_str = (
f"Angle = {a:.4f} "
f"{'+' if b>=0 else '-'} {abs(b):.5f} * abs(Lat) "
f"{'+' if c>=0 else '-'} {abs(c):.5f} * Lat² "
f"{'+' if d>=0 else '-'} {abs(d):.5f} * Elevation "
f"{'+' if e>=0 else '-'} {abs(e):.5f} * abs(Lat) * cos(2π * (DayOfYear - {f:.1f}) / 365)"
)
print(f"\n✨ THE CLEAN FORMULA ✨\n{eq_str}\n")
elif best_name == "Lat-Dependent Season":
a, b, c, d = best_popt
eq_str = (
f"Angle = {a:.4f} "
f"{'+' if b>=0 else '-'} {abs(b):.5f} * abs(Lat) "
f"{'+' if c>=0 else '-'} {abs(c):.5f} * abs(Lat) * cos(2π * (DayOfYear - {d:.1f}) / 365)"
)
print(f"\n✨ THE CLEAN FORMULA ✨\n{eq_str}\n")
def main():
base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
fajr_path = os.path.join(base_dir, "data", "processed", "fajr_angles.csv")
isha_path = os.path.join(base_dir, "data", "processed", "isha_angles.csv")
# 1. Clean outliers aggressively to find the mathematically purest underlying signal
# We will filter out angles < 10 and > 22 for absolute pure signal matching
if os.path.exists(fajr_path):
df = pd.read_csv(fajr_path)
df_clean = df[(df['fajr_angle'] >= 10.0) & (df['fajr_angle'] <= 20.0)]
analyze_formula(df_clean, "fajr")
if os.path.exists(isha_path):
df = pd.read_csv(isha_path)
df_clean = df[(df['isha_angle'] >= 11.0) & (df['isha_angle'] <= 21.0)]
analyze_formula(df_clean, "isha")
if __name__ == "__main__":
main()

1
src/analyze/praytimes.py Normal file
View file

@ -0,0 +1 @@
404: Not Found

View file

@ -0,0 +1,102 @@
import pandas as pd
import numpy as np
import warnings
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from scipy.optimize import curve_fit
warnings.filterwarnings("ignore")
def exp_func(x, a, b, c):
return a * np.exp(b * x) + c
def eval_algorithms(df, name):
print(f"\n==========================================")
print(f" {name.upper()} ML PATTERN ANALYSIS ")
print(f"==========================================")
print(f"Total verified observations: {len(df)}")
# Check column name for angle
angle_col = f"{name.lower()}_angle"
if angle_col not in df.columns:
print(f"Error: {angle_col} not found in columns")
return
df['doy'] = df['day_of_year']
df['lat_abs'] = df['lat'].abs()
# Missing value cleaning just in case
df = df.dropna(subset=[angle_col, 'doy', 'lat_abs'])
# 1. Seasonality Feature Engineering (Sin/Cos mapping of Time of Year)
df['doy_sin'] = np.sin(2 * np.pi * df['doy'] / 365)
df['doy_cos'] = np.cos(2 * np.pi * df['doy'] / 365)
X_lat = df[['lat_abs']].values
X_all = df[['lat_abs', 'doy_sin', 'doy_cos']].values
y = df[angle_col].values
# --- A. BASELINE FIXED ALGORITHM (ISNA/MWL style: 15° or 18°) ---
print("\n[A] STATIC ALGORITHM BENCHMARKS (15° vs 18°)")
mae_15 = mean_absolute_error(y, np.full_like(y, 15.0))
mae_18 = mean_absolute_error(y, np.full_like(y, 18.0))
print(f" Fixed 15° Error (MAE): {mae_15:.3f}°")
print(f" Fixed 18° Error (MAE): {mae_18:.3f}°")
# --- B. LATITUDE ONLY (Linear vs Exponential) ---
print("\n[B] LATITUDE PATTERNS (Equator distance vs Angle)")
# Linear Latitude
lr_lat = LinearRegression().fit(X_lat, y)
y_pred_lat_lin = lr_lat.predict(X_lat)
print(f" Linear Latitude Fit : R2={r2_score(y, y_pred_lat_lin):.3f}, MAE={mean_absolute_error(y, y_pred_lat_lin):.3f}°")
print(f" -> Coefficient: {lr_lat.coef_[0]:.4f} degrees per 1° latitude move")
# Exponential Latitude
try:
# We model the depression angle dropping exponentially toward the poles
popt, _ = curve_fit(exp_func, df['lat_abs'], y, p0=[-1, 0.05, 18], maxfev=5000)
y_pred_lat_exp = exp_func(df['lat_abs'], *popt)
r2_exp = r2_score(y, y_pred_lat_exp)
mae_exp = mean_absolute_error(y, y_pred_lat_exp)
print(f" Exponential Latitude Fit : R2={r2_exp:.3f}, MAE={mae_exp:.3f}°")
except Exception as e:
print(f" Exponential Fit failed: {e}")
# --- C. TIME OF YEAR (SEASONALITY) IMPACT ---
print("\n[C] TIME OF YEAR (SEASONAL) CORRELATIONS")
lr_all = LinearRegression().fit(X_all, y)
y_pred_all = lr_all.predict(X_all)
print(f" Combined Model (Lat + Seasonality): R2={r2_score(y, y_pred_all):.3f}, MAE={mean_absolute_error(y, y_pred_all):.3f}°")
# Feature Importance
print(f" -> Lat Importance : {abs(lr_all.coef_[0]):.4f}")
print(f" -> TOY (Season) Sine: {abs(lr_all.coef_[1]):.4f} amplitude")
# --- D. DYNAMIC ALGORITHM COMPARISON ---
high_lat_mask = df['lat_abs'] > 40
if sum(high_lat_mask) > 0:
high_lat_mean = df[high_lat_mask][angle_col].mean()
low_lat_mean = df[~high_lat_mask][angle_col].mean()
print(f"\n[D] ALGORITHMIC COMPARISON")
print(f" Mean empirical angle at >40° Lat : {high_lat_mean:.2f}°")
print(f" Mean empirical angle at <=40° Lat : {low_lat_mean:.2f}°")
shift = low_lat_mean - high_lat_mean
print(f" Shift detected: {shift:.2f}° absolute reduction when moving towards poles.")
if shift > 0.5:
print(" => Matches Moonsighting/DPC dynamic logic (angles drop significantly at higher latitudes).")
else:
print(" => Does NOT strongly match Moonsighting dynamic logic.")
if __name__ == '__main__':
try:
fajr_df = pd.read_csv('data/processed/fajr_angles.csv')
eval_algorithms(fajr_df, 'Fajr')
except Exception as e:
print("Missing Fajr dataset:", e)
try:
isha_df = pd.read_csv('data/processed/isha_angles.csv')
eval_algorithms(isha_df, 'Isha')
except Exception as e:
print("Missing Isha dataset:", e)

671
src/bsrn_download.py Normal file
View file

@ -0,0 +1,671 @@
"""
BSRN Data Downloader and Twilight Extractor
Downloads BSRN irradiance data from:
1. Zenodo 5-site dataset (quick, 58.5 MB parquet)
2. CAELUS Zenodo dataset (54 stations, prioritizing 20-45°N)
For each station-year, extracts twilight events:
- Dawn: moment GHI first exceeds threshold after midnight
- Dusk: last moment GHI exceeds threshold before midnight
Computes solar depression angle at each event.
Outputs CSV rows suitable for pray-calc fajr/isha angle analysis.
"""
import os
import sys
import math
import zipfile
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timezone, date, timedelta
from pathlib import Path
from io import BytesIO
# ── paths ────────────────────────────────────────────────────────────────────
BASE = Path("/Volumes/X9/Sites/acamarata/pray-calc-ml")
RAW_BSRN = BASE / "data/raw/bsrn"
RAW_SIGHTINGS = BASE / "data/raw/raw_sightings"
RAW_BSRN.mkdir(parents=True, exist_ok=True)
RAW_SIGHTINGS.mkdir(parents=True, exist_ok=True)
# ── thresholds ────────────────────────────────────────────────────────────────
# GHI threshold in W/m² to call "detectable irradiance"
# 0.5 W/m² ~ starlight. 1 W/m² is typical first-light threshold.
# We extract events at multiple thresholds.
THRESHOLDS = [0.5, 1.0, 2.0, 5.0, 10.0]
PRIMARY_THRESHOLD = 1.0 # used for the main sightings CSV
# ── station priority list (CAELUS dataset) ────────────────────────────────────
# Prioritize stations in 20-45°N, especially Muslim-majority/relevant regions
PRIORITY_STATIONS = {
"sbo": {"name": "Sede Boqer", "lat": 30.8597, "lon": 34.7794, "country": "Israel/Negev"},
"tam": {"name": "Tamanrasset", "lat": 22.7903, "lon": 5.5292, "country": "Algeria"},
"xia": {"name": "Xianghe", "lat": 39.754, "lon": 116.962, "country": "China"},
"gur": {"name": "Gurgaon", "lat": 28.4249, "lon": 77.156, "country": "India"},
"iza": {"name": "Izana", "lat": 28.3093, "lon": -16.4993, "country": "Spain/Canary"},
"ish": {"name": "Ishigakijima", "lat": 24.3367, "lon": 124.1644, "country": "Japan"},
"lln": {"name": "Lulin", "lat": 23.4686, "lon": 120.8736, "country": "Taiwan"},
"dra": {"name": "Desert Rock", "lat": 36.626, "lon": -116.018, "country": "USA/Nevada"},
"e13": {"name": "SouthernGreatPlains","lat": 36.605, "lon": -97.485, "country": "USA/Oklahoma"},
"tat": {"name": "Tateno", "lat": 36.0581, "lon": 140.1258, "country": "Japan"},
"fua": {"name": "Fukuoka", "lat": 33.5822, "lon": 130.3764, "country": "Japan"},
"gcr": {"name": "GoodwinCreek", "lat": 34.2547, "lon": -89.8729, "country": "USA/Mississippi"},
"bil": {"name": "Billings", "lat": 36.605, "lon": -97.516, "country": "USA/Oklahoma"},
}
# Years available in CAELUS per station (approximate; we'll try 2007-2017)
CAELUS_YEAR_RANGE = list(range(2007, 2018))
# ── solar geometry ────────────────────────────────────────────────────────────
def solar_declination(day_of_year: int) -> float:
"""Solar declination in degrees (Spencer 1971)."""
B = 2 * math.pi * (day_of_year - 1) / 365
return (180 / math.pi) * (
0.006918
- 0.399912 * math.cos(B)
+ 0.070257 * math.sin(B)
- 0.006758 * math.cos(2 * B)
+ 0.000907 * math.sin(2 * B)
- 0.002697 * math.cos(3 * B)
+ 0.00148 * math.sin(3 * B)
)
def equation_of_time_minutes(day_of_year: int) -> float:
"""Equation of time in minutes (Spencer 1971)."""
B = 2 * math.pi * (day_of_year - 1) / 365
return (180 / math.pi) * (
0.000075
+ 0.001868 * math.cos(B)
- 0.032077 * math.sin(B)
- 0.014615 * math.cos(2 * B)
- 0.04089 * math.sin(2 * B)
) * (180 / 15) # convert to minutes
def solar_hour_angle(utc_hour_frac: float, lon_deg: float, day_of_year: int) -> float:
"""Solar hour angle in degrees at given UTC fractional hour and longitude."""
eot = equation_of_time_minutes(day_of_year)
# Local solar time in hours
lst = utc_hour_frac + lon_deg / 15.0 + eot / 60.0
# Hour angle (0 at solar noon)
return 15.0 * (lst - 12.0)
def solar_elevation(lat_deg: float, lon_deg: float, utc_dt: datetime) -> float:
"""Solar elevation angle in degrees (positive above horizon)."""
doy = utc_dt.timetuple().tm_yday
utc_frac = utc_dt.hour + utc_dt.minute / 60.0 + utc_dt.second / 3600.0
decl = math.radians(solar_declination(doy))
lat = math.radians(lat_deg)
ha = math.radians(solar_hour_angle(utc_frac, lon_deg, doy))
sin_elev = (
math.sin(lat) * math.sin(decl)
+ math.cos(lat) * math.cos(decl) * math.cos(ha)
)
sin_elev = max(-1.0, min(1.0, sin_elev))
return math.degrees(math.asin(sin_elev))
def solar_depression_angle(lat_deg: float, lon_deg: float, utc_dt: datetime) -> float:
"""Solar depression angle = negative elevation (positive below horizon)."""
return -solar_elevation(lat_deg, lon_deg, utc_dt)
# ── download helpers ──────────────────────────────────────────────────────────
def download_file(url: str, dest: Path, label: str = "") -> bool:
"""Download a file with progress. Returns True on success."""
if dest.exists() and dest.stat().st_size > 1000:
print(f" [skip] {label or dest.name} already downloaded")
return True
try:
print(f" [download] {label or dest.name} ...", end=" ", flush=True)
r = requests.get(url, timeout=120, stream=True)
r.raise_for_status()
total = int(r.headers.get("content-length", 0))
downloaded = 0
with open(dest, "wb") as f:
for chunk in r.iter_content(chunk_size=65536):
f.write(chunk)
downloaded += len(chunk)
mb = downloaded / 1_048_576
print(f"OK ({mb:.1f} MB)")
return True
except Exception as e:
print(f"FAILED: {e}")
if dest.exists():
dest.unlink()
return False
# ── twilight extraction ───────────────────────────────────────────────────────
def extract_twilight_events(
df: pd.DataFrame,
lat: float,
lon: float,
station_code: str,
station_name: str,
source: str,
threshold: float = PRIMARY_THRESHOLD,
) -> list[dict]:
"""
Given a dataframe with columns [timestamp, ghi] (UTC timestamps),
extract per-day dawn and dusk events.
Dawn: first minute where GHI >= threshold after a run of zeros
Dusk: last minute where GHI >= threshold before a run of zeros
Returns a list of dicts with keys:
date, event_type, utc_time, ghi_wm2, solar_depression_deg,
lat, lon, station_code, station_name, source, threshold_wm2
"""
events = []
# Ensure UTC
if df["timestamp"].dt.tz is None:
df = df.copy()
df["timestamp"] = df["timestamp"].dt.tz_localize("UTC")
else:
df = df.copy()
df["timestamp"] = df["timestamp"].dt.tz_convert("UTC")
# Sort and drop nulls
df = df.dropna(subset=["ghi"]).sort_values("timestamp").reset_index(drop=True)
df["ghi"] = pd.to_numeric(df["ghi"], errors="coerce")
df = df.dropna(subset=["ghi"])
# Group by UTC date (using the date of the timestamp)
df["date"] = df["timestamp"].dt.date
for day, group in df.groupby("date"):
group = group.sort_values("timestamp").reset_index(drop=True)
ghi = group["ghi"].values
ts = group["timestamp"].values # numpy datetime64
# Find dawn: first index where GHI crosses threshold from below
# We require at least 3 consecutive sub-threshold readings before the crossing
dawn_idx = None
for i in range(3, len(ghi)):
if ghi[i] >= threshold and all(g < threshold for g in ghi[max(0, i-3):i]):
dawn_idx = i
break
# Find dusk: last index where GHI is >= threshold followed by sub-threshold
dusk_idx = None
for i in range(len(ghi) - 4, -1, -1):
if ghi[i] >= threshold and all(g < threshold for g in ghi[i+1:min(len(ghi), i+4)]):
dusk_idx = i
break
for idx, event_type in [(dawn_idx, "dawn"), (dusk_idx, "dusk")]:
if idx is None:
continue
ts_val = pd.Timestamp(ts[idx])
utc_dt = ts_val.to_pydatetime().replace(tzinfo=timezone.utc)
dep = solar_depression_angle(lat, lon, utc_dt)
ghi_val = float(ghi[idx])
# Only record events where sun is clearly below horizon
# (eliminates cloudy days where GHI is suppressed during daytime)
if dep < 0: # sun above horizon — skip, not a real twilight event
continue
if dep > 25: # sun more than 25° below — likely data noise
continue
events.append({
"date": str(day),
"event_type": event_type,
"utc_time": utc_dt.strftime("%H:%M"),
"ghi_wm2": round(ghi_val, 2),
"solar_depression_deg": round(dep, 4),
"lat": lat,
"lon": lon,
"station_code": station_code,
"station_name": station_name,
"source": source,
"threshold_wm2": threshold,
})
return events
# ── dataset 1: BSRN 5-site parquet ───────────────────────────────────────────
FIVE_SITE_URL = "https://zenodo.org/records/10593079/files/bsrn_valid_five_sites.parquet"
FIVE_SITE_PATH = RAW_BSRN / "bsrn_valid_five_sites.parquet"
FIVE_SITE_META = {
"mnm": {"name": "Minamitorishima", "lat": 24.2833, "lon": 153.9667},
"asp": {"name": "Alice Springs", "lat": -23.7984, "lon": 133.8880},
"car": {"name": "Carpentras", "lat": 44.0833, "lon": 5.0583},
"bon": {"name": "Bondville", "lat": 40.0667, "lon": -88.3667},
"son": {"name": "Sonnblick", "lat": 47.0542, "lon": 12.9578},
}
def process_five_site() -> list[dict]:
"""Download and process the BSRN 5-site parquet dataset."""
print("\n=== BSRN 5-site dataset (Zenodo 10593079) ===")
ok = download_file(FIVE_SITE_URL, FIVE_SITE_PATH, "bsrn_valid_five_sites.parquet")
if not ok:
print(" ERROR: could not download 5-site dataset")
return []
print(" Loading parquet ...", end=" ", flush=True)
df_all = pd.read_parquet(FIVE_SITE_PATH)
print(f"OK — {len(df_all):,} rows, columns: {list(df_all.columns)}")
all_events = []
# Detect timestamp column
ts_col = None
for c in ["timestamp", "time", "datetime", "date_time", "Timestamp", "TIME"]:
if c in df_all.columns:
ts_col = c
break
if ts_col is None:
# Use first datetime-like column
for c in df_all.columns:
if pd.api.types.is_datetime64_any_dtype(df_all[c]):
ts_col = c
break
if ts_col is None:
print(f" WARNING: cannot find timestamp column. Columns: {list(df_all.columns)[:20]}")
return []
# Detect GHI column
ghi_col = None
for c in ["ghi", "GHI", "global_horizontal_irradiance", "SWD", "swd", "global"]:
if c in df_all.columns:
ghi_col = c
break
if ghi_col is None:
print(f" WARNING: cannot find GHI column. Columns: {list(df_all.columns)[:20]}")
return []
# Detect station column
stn_col = None
for c in ["station", "sta", "site", "STATION"]:
if c in df_all.columns:
stn_col = c
break
print(f" Using ts_col='{ts_col}', ghi_col='{ghi_col}', station_col='{stn_col}'")
if stn_col:
groups = {stn: sub for stn, sub in df_all.groupby(stn_col)}
else:
# Single station file — infer from filename or use first meta entry
groups = {"unknown": df_all}
for stn_code, sub in groups.items():
stn_key = str(stn_code).lower()
meta = FIVE_SITE_META.get(stn_key, None)
if meta is None:
# Try to find lat/lon in data
if "latitude" in sub.columns and "longitude" in sub.columns:
lat = float(sub["latitude"].iloc[0])
lon = float(sub["longitude"].iloc[0])
name = stn_key
else:
print(f" SKIP {stn_key}: no metadata")
continue
else:
lat, lon, name = meta["lat"], meta["lon"], meta["name"]
work = sub[[ts_col, ghi_col]].copy()
work.columns = ["timestamp", "ghi"]
work["timestamp"] = pd.to_datetime(work["timestamp"], utc=True, errors="coerce")
work = work.dropna(subset=["timestamp"])
print(f" Processing {name} ({stn_key}): {len(work):,} rows ...", end=" ", flush=True)
evts = extract_twilight_events(work, lat, lon, stn_key, name, "bsrn_5site")
print(f"{len(evts)} events")
all_events.extend(evts)
return all_events
# ── dataset 2: CAELUS ZIP files ───────────────────────────────────────────────
CAELUS_BASE_URL = "https://zenodo.org/api/records/7897639/files/{filename}/content"
def caelus_filename(station_code: str, year: int) -> str:
return f"{station_code}_bsrn_{year}.zip"
def list_caelus_available() -> dict[str, list[int]]:
"""
Probe Zenodo API to get available files, return dict of station -> [years].
"""
print("\n Querying CAELUS file list from Zenodo API ...", end=" ", flush=True)
try:
r = requests.get("https://zenodo.org/api/records/7897639", timeout=30)
r.raise_for_status()
data = r.json()
available: dict[str, list[int]] = {}
for f in data.get("files", []):
fname = f.get("key", "")
if fname.endswith(".zip") and "_bsrn_" in fname:
parts = fname.replace(".zip", "").split("_bsrn_")
if len(parts) == 2:
code = parts[0]
try:
year = int(parts[1])
available.setdefault(code, []).append(year)
except ValueError:
pass
print(f"found {len(available)} stations")
return available
except Exception as e:
print(f"FAILED: {e}")
return {}
def process_caelus_zip(zip_path: Path, station_code: str, meta: dict, year: int) -> list[dict]:
"""Extract parquet from ZIP, parse, return events."""
events = []
try:
with zipfile.ZipFile(zip_path, "r") as zf:
parquet_files = [n for n in zf.namelist() if n.endswith(".parquet")]
if not parquet_files:
print(f" WARNING: no parquet in {zip_path.name}")
return []
for pf in parquet_files:
with zf.open(pf) as f:
df = pd.read_parquet(BytesIO(f.read()))
# Detect columns
ts_col = None
for c in ["timestamp", "time", "datetime", "Timestamp", "TIME"]:
if c in df.columns:
ts_col = c
break
if ts_col is None:
for c in df.columns:
if pd.api.types.is_datetime64_any_dtype(df[c]):
ts_col = c
break
if ts_col is None:
print(f" WARNING: no timestamp in {pf}")
continue
ghi_col = None
for c in ["ghi", "GHI", "SWD", "swd", "global_horizontal_irradiance", "global"]:
if c in df.columns:
ghi_col = c
break
if ghi_col is None:
# Use diffuse if available
for c in ["dhi", "DHI", "diffuse"]:
if c in df.columns:
ghi_col = c
break
if ghi_col is None:
print(f" WARNING: no GHI column in {pf}. cols={list(df.columns)[:15]}")
continue
work = df[[ts_col, ghi_col]].copy()
work.columns = ["timestamp", "ghi"]
work["timestamp"] = pd.to_datetime(work["timestamp"], utc=True, errors="coerce")
work = work.dropna(subset=["timestamp"])
evts = extract_twilight_events(
work,
meta["lat"], meta["lon"],
station_code, meta["name"],
f"caelus_{year}",
)
events.extend(evts)
except zipfile.BadZipFile as e:
print(f" ERROR: bad zip {zip_path.name}: {e}")
except Exception as e:
print(f" ERROR processing {zip_path.name}: {e}")
return events
def process_caelus(max_stations: int = 8, max_years_per_station: int = 5) -> list[dict]:
"""Download and process CAELUS priority stations."""
print("\n=== CAELUS dataset (Zenodo 7897639) ===")
available = list_caelus_available()
all_events = []
processed_count = 0
for code, meta in PRIORITY_STATIONS.items():
if processed_count >= max_stations:
break
if code not in available:
print(f" SKIP {code} ({meta['name']}): not in CAELUS dataset")
continue
years = sorted(available[code])
years_to_use = years[:max_years_per_station]
print(f"\n Station: {meta['name']} ({code}) | lat={meta['lat']} lon={meta['lon']} | years: {years_to_use}")
station_events = []
for year in years_to_use:
fname = caelus_filename(code, year)
url = CAELUS_BASE_URL.format(filename=fname)
dest = RAW_BSRN / fname
ok = download_file(url, dest, fname)
if not ok:
continue
print(f" Processing {fname} ...", end=" ", flush=True)
evts = process_caelus_zip(dest, code, meta, year)
print(f"{len(evts)} events")
station_events.extend(evts)
print(f" Total events for {code}: {len(station_events)}")
all_events.extend(station_events)
processed_count += 1
return all_events
# ── SURFRAD download ──────────────────────────────────────────────────────────
SURFRAD_STATIONS = {
"dra": {
"name": "Desert Rock",
"lat": 36.626, "lon": -116.018,
"ftp_dir": "DRA",
},
"tbl": {
"name": "Table Mountain",
"lat": 40.125, "lon": -105.237,
"ftp_dir": "TBL",
},
}
SURFRAD_BASE = "https://gml.noaa.gov/aftp/data/radiation/surfrad"
def parse_surfrad_file(content: bytes, station_meta: dict) -> pd.DataFrame:
"""
Parse SURFRAD dat file format.
Returns dataframe with [timestamp, ghi] columns.
"""
lines = content.decode("utf-8", errors="replace").splitlines()
# Skip header (first line is site info, second is column header)
# Format: year jday hour min dt zen dw_solar dw_solar_stdev ...
records = []
for line in lines[2:]:
parts = line.split()
if len(parts) < 9:
continue
try:
yr = int(parts[0])
jday = int(parts[1])
hr = int(parts[2])
mn = int(parts[3])
# GHI is column index 6 (dw_solar = downwelling shortwave)
ghi_raw = float(parts[6])
# Build datetime
dt = datetime(yr, 1, 1, hr, mn, tzinfo=timezone.utc) + timedelta(days=jday - 1)
# SURFRAD uses -9999 for missing
ghi = None if ghi_raw < -900 else max(0.0, ghi_raw)
records.append({"timestamp": dt, "ghi": ghi})
except (ValueError, IndexError):
continue
return pd.DataFrame(records)
def process_surfrad(years: list[int] | None = None) -> list[dict]:
"""Download and process SURFRAD Desert Rock and Table Mountain."""
print("\n=== SURFRAD dataset (NOAA GML FTP) ===")
if years is None:
years = [2018, 2019, 2020] # 3 recent years
all_events = []
for code, meta in SURFRAD_STATIONS.items():
station_dir = meta["ftp_dir"]
print(f"\n Station: {meta['name']} ({code}) | lat={meta['lat']} lon={meta['lon']}")
for year in years:
print(f" Year {year}:")
station_events = []
# SURFRAD stores one file per day: DRA/YYYY/draYYYjjj.dat
# where jjj = day of year (001-365)
# We'll try to get the full year index
year_dir_url = f"{SURFRAD_BASE}/{station_dir}/{year}/"
try:
r = requests.get(year_dir_url, timeout=30)
if r.status_code != 200:
print(f" Cannot access {year_dir_url}: {r.status_code}")
continue
# Parse file listing (Apache directory index)
import re
files = re.findall(rf'{code.lower()}\d{{5}}\.dat', r.text)
files = sorted(set(files))
print(f" Found {len(files)} daily files")
dest_dir = RAW_BSRN / "surfrad" / station_dir / str(year)
dest_dir.mkdir(parents=True, exist_ok=True)
day_records = []
for fname in files:
dest = dest_dir / fname
file_url = f"{year_dir_url}{fname}"
ok = download_file(file_url, dest, fname)
if not ok:
continue
with open(dest, "rb") as f:
content = f.read()
day_df = parse_surfrad_file(content, meta)
if not day_df.empty:
day_records.append(day_df)
if day_records:
df_year = pd.concat(day_records, ignore_index=True)
print(f" {len(df_year):,} 1-min records loaded", end=" ")
evts = extract_twilight_events(
df_year, meta["lat"], meta["lon"],
code, meta["name"], f"surfrad_{year}",
)
print(f"{len(evts)} twilight events")
station_events.extend(evts)
except Exception as e:
print(f" ERROR: {e}")
continue
all_events.extend(station_events)
return all_events
# ── save results ──────────────────────────────────────────────────────────────
def save_sightings(events: list[dict], filename: str) -> Path:
"""Save events list to CSV."""
if not events:
print(f" WARNING: no events to save to {filename}")
return RAW_SIGHTINGS / filename
df = pd.DataFrame(events)
df = df.sort_values(["station_code", "date", "event_type"]).reset_index(drop=True)
out = RAW_SIGHTINGS / filename
df.to_csv(out, index=False)
print(f" Saved {len(df)} events → {out}")
return out
def print_summary(events: list[dict], label: str):
if not events:
print(f"\n{label}: no events")
return
df = pd.DataFrame(events)
print(f"\n{'='*60}")
print(f"SUMMARY: {label}")
print(f" Total events: {len(df)}")
print(f" Dawn events: {(df['event_type']=='dawn').sum()}")
print(f" Dusk events: {(df['event_type']=='dusk').sum()}")
print(f" Stations: {df['station_code'].nunique()}{sorted(df['station_code'].unique())}")
print(f" Date range: {df['date'].min()} to {df['date'].max()}")
dep = df["solar_depression_deg"]
print(f" Depression angle: mean={dep.mean():.2f}° std={dep.std():.2f}° "
f"min={dep.min():.2f}° max={dep.max():.2f}°")
if "event_type" in df.columns:
for evt_type in ["dawn", "dusk"]:
sub = df[df["event_type"] == evt_type]
if not sub.empty:
d = sub["solar_depression_deg"]
print(f" {evt_type.upper()} depression: mean={d.mean():.2f}° "
f"p10={d.quantile(0.1):.2f}° p90={d.quantile(0.9):.2f}°")
print(f"{'='*60}")
# ── main ──────────────────────────────────────────────────────────────────────
def main():
print("BSRN Twilight Extractor")
print(f"Output dir: {RAW_SIGHTINGS}")
print(f"Raw data: {RAW_BSRN}")
print(f"GHI threshold: {PRIMARY_THRESHOLD} W/m²\n")
all_events = []
# 1. BSRN 5-site (quick, 58.5 MB)
five_site_events = process_five_site()
save_sightings(five_site_events, "bsrn_5site_twilight.csv")
print_summary(five_site_events, "BSRN 5-site dataset")
all_events.extend(five_site_events)
# 2. CAELUS (priority stations, up to 8 stations × 5 years)
caelus_events = process_caelus(max_stations=8, max_years_per_station=5)
save_sightings(caelus_events, "bsrn_caelus_twilight.csv")
print_summary(caelus_events, "CAELUS dataset")
all_events.extend(caelus_events)
# 3. SURFRAD (Desert Rock + Table Mountain)
surfrad_events = process_surfrad(years=[2018, 2019, 2020])
save_sightings(surfrad_events, "surfrad_twilight.csv")
print_summary(surfrad_events, "SURFRAD dataset")
all_events.extend(surfrad_events)
# Combined
save_sightings(all_events, "bsrn_all_twilight.csv")
print_summary(all_events, "ALL SOURCES COMBINED")
print("\nDone.")
if __name__ == "__main__":
main()

563
src/bsrn_process.py Normal file
View file

@ -0,0 +1,563 @@
"""
BSRN Twilight Extractor corrected for actual file formats.
Dataset formats:
1. BSRN 5-site parquet (Zenodo 10593079):
- Multi-index: (times_utc, site)
- Columns: climate, longitude, latitude, sza, eth, ghicda, ghics, difcs, ghi, dif, sky_type
- 'site' values like 'asp/bsrn', 'bon/bsrn', etc.
2. CAELUS ZIPs (Zenodo 7897639):
- Each ZIP contains one CSV named {code}_bsrn_{year}.csv
- Columns: times_utc, sza, eth, ghi, dif, ghics, ghicda, sky_type
- sza = solar zenith angle (degrees). Depression = sza - 90 when sza > 90.
For twilight extraction we use BOTH ghi AND dif (diffuse horizontal irradiance).
Diffuse is more sensitive at twilight it captures scattered skylight before direct beam arrives.
The `sza` column is authoritative for solar geometry no need to recompute.
Depression angle = sza - 90° (valid when sza > 90°, i.e., sun below horizon).
Thresholds extracted (multi-threshold approach):
- 0.5, 1, 2, 5, 10 W/ for both ghi and dif
Primary threshold for the main output: dif >= 1.0 W/ (diffuse is more reliable at twilight).
"""
import os
import math
import zipfile
import requests
import re
import pandas as pd
import numpy as np
from datetime import datetime, timezone, timedelta
from pathlib import Path
from io import BytesIO
# ── paths ─────────────────────────────────────────────────────────────────────
BASE = Path("/Volumes/X9/Sites/acamarata/pray-calc-ml")
RAW_BSRN = BASE / "data/raw/bsrn"
RAW_SIGHTINGS = BASE / "data/raw/raw_sightings"
RAW_BSRN.mkdir(parents=True, exist_ok=True)
RAW_SIGHTINGS.mkdir(parents=True, exist_ok=True)
# ── thresholds ────────────────────────────────────────────────────────────────
THRESHOLDS = [0.5, 1.0, 2.0, 5.0, 10.0]
PRIMARY_THRESHOLD = 1.0 # W/m²
# ── station metadata ──────────────────────────────────────────────────────────
# For CAELUS: station codes and their coordinates
CAELUS_STATION_META = {
"sbo": {"name": "Sede Boqer", "lat": 30.8597, "lon": 34.7794, "region": "Negev/Israel"},
"tam": {"name": "Tamanrasset", "lat": 22.7903, "lon": 5.5292, "region": "Algeria"},
"xia": {"name": "Xianghe", "lat": 39.754, "lon": 116.962, "region": "China"},
"iza": {"name": "Izana", "lat": 28.3093, "lon": -16.4993, "region": "Canary Islands/Spain"},
"ish": {"name": "Ishigakijima", "lat": 24.3367, "lon": 124.1644, "region": "Japan"},
"dra": {"name": "Desert Rock", "lat": 36.626, "lon": -116.018, "region": "Nevada/USA"},
"e13": {"name": "SouthernGreatPlains", "lat": 36.605, "lon": -97.485, "region": "Oklahoma/USA"},
"tat": {"name": "Tateno", "lat": 36.0581, "lon": 140.1258, "region": "Japan"},
"fua": {"name": "Fukuoka", "lat": 33.5822, "lon": 130.3764, "region": "Japan"},
"gcr": {"name": "GoodwinCreek", "lat": 34.2547, "lon": -89.8729, "region": "Mississippi/USA"},
"bil": {"name": "Billings/ARM", "lat": 36.605, "lon": -97.516, "region": "Oklahoma/USA"},
"abs": {"name": "Abashiri", "lat": 44.0178, "lon": 144.2797, "region": "Japan"},
"lyu": {"name": "Lanyu Island", "lat": 22.037, "lon": 121.5583, "region": "Taiwan"},
"lln": {"name": "Lulin", "lat": 23.4686, "lon": 120.8736, "region": "Taiwan"},
"cnr": {"name": "Cener", "lat": 42.816, "lon": -1.601, "region": "Spain/Navarra"},
"flo": {"name": "Florina", "lat": 40.78, "lon": 21.38, "region": "Greece"},
"car": {"name": "Carpentras", "lat": 44.0833, "lon": 5.0583, "region": "France"},
}
# Five-site parquet station metadata (lat/lon stored in the file itself)
FIVE_SITE_MAP = {
"mnm": {"name": "Minamitorishima", "region": "Japan"},
"asp": {"name": "Alice Springs", "region": "Australia"},
"car": {"name": "Carpentras", "region": "France"},
"bon": {"name": "Bondville", "region": "USA/Illinois"},
"son": {"name": "Sonnblick", "region": "Austria"},
}
CAELUS_BASE_URL = "https://zenodo.org/api/records/7897639/files/{filename}/content"
FIVE_SITE_URL = "https://zenodo.org/records/10593079/files/bsrn_valid_five_sites.parquet"
FIVE_SITE_PATH = RAW_BSRN / "bsrn_valid_five_sites.parquet"
# ── download helper ───────────────────────────────────────────────────────────
def download_file(url: str, dest: Path, label: str = "") -> bool:
if dest.exists() and dest.stat().st_size > 1000:
print(f" [cached] {label or dest.name}")
return True
try:
print(f" [dl] {label or dest.name} ...", end=" ", flush=True)
r = requests.get(url, timeout=180, stream=True)
r.raise_for_status()
downloaded = 0
with open(dest, "wb") as f:
for chunk in r.iter_content(chunk_size=131072):
f.write(chunk)
downloaded += len(chunk)
print(f"OK ({downloaded/1_048_576:.1f} MB)")
return True
except Exception as e:
print(f"FAILED: {e}")
if dest.exists():
dest.unlink()
return False
# ── twilight extraction ───────────────────────────────────────────────────────
def extract_events_from_df(
df: pd.DataFrame,
lat: float,
lon: float,
station_code: str,
station_name: str,
source: str,
) -> list[dict]:
"""
df must have columns: times_utc (datetime64[ns] UTC), sza (float), ghi (float), dif (float).
For each day:
dawn = first minute where dif (or ghi if dif is null) crosses PRIMARY_THRESHOLD
AND sza > 90 (sun still below horizon)
dusk = last minute where dif (or ghi) >= PRIMARY_THRESHOLD
AND sza > 90
Records depression angle = sza - 90.0 at the event moment.
Also records raw threshold-by-threshold data for all THRESHOLDS.
"""
events = []
df = df.copy()
df["times_utc"] = pd.to_datetime(df["times_utc"], utc=True, errors="coerce")
df = df.dropna(subset=["times_utc"]).sort_values("times_utc").reset_index(drop=True)
# Build irradiance signal: prefer dif (diffuse) which is more sensitive at twilight
# Fall back to ghi if dif is all NaN
if "dif" in df.columns:
df["irr"] = pd.to_numeric(df["dif"], errors="coerce")
else:
df["irr"] = float("nan")
if "ghi" in df.columns:
ghi_vals = pd.to_numeric(df["ghi"], errors="coerce")
# Fill irr NaN with ghi
df["irr"] = df["irr"].combine_first(ghi_vals)
df["sza"] = pd.to_numeric(df["sza"], errors="coerce")
df["date"] = df["times_utc"].dt.date
for day, grp in df.groupby("date"):
grp = grp.sort_values("times_utc").reset_index(drop=True)
irr = grp["irr"].values
sza = grp["sza"].values
ts = grp["times_utc"].values
n = len(irr)
if n < 10:
continue
# For each threshold, extract dawn/dusk
for threshold in THRESHOLDS:
# Dawn: first index where irr >= threshold AND sza > 90
# Require at least 2 consecutive sub-threshold readings immediately before
dawn_idx = None
for i in range(2, n):
if (
not np.isnan(irr[i])
and irr[i] >= threshold
and not np.isnan(sza[i])
and sza[i] > 90.0
):
# Confirm at least 2 prior readings were sub-threshold
prev_ok = all(
(np.isnan(v) or v < threshold)
for v in irr[max(0, i-3):i]
)
if prev_ok:
dawn_idx = i
break
# Dusk: last index where irr >= threshold AND sza > 90
# Require at least 2 consecutive sub-threshold readings immediately after
dusk_idx = None
for i in range(n - 3, -1, -1):
if (
not np.isnan(irr[i])
and irr[i] >= threshold
and not np.isnan(sza[i])
and sza[i] > 90.0
):
after_ok = all(
(np.isnan(v) or v < threshold)
for v in irr[i+1:min(n, i+4)]
)
if after_ok:
dusk_idx = i
break
for idx, event_type in [(dawn_idx, "dawn"), (dusk_idx, "dusk")]:
if idx is None:
continue
sza_val = float(sza[idx])
# sza must be > 90 (sun below horizon) — extra safety check
if sza_val <= 90.0:
continue
# Depression angle in degrees
dep = round(sza_val - 90.0, 4)
# Only record plausible twilight depression (0.5° to 20°)
if dep < 0.3 or dep > 20.0:
continue
utc_ts = pd.Timestamp(ts[idx])
irr_val = float(irr[idx])
# Also grab ghi/dif individually at this moment
ghi_at = float(grp["ghi"].iloc[idx]) if "ghi" in grp.columns else float("nan")
dif_at = float(grp["dif"].iloc[idx]) if "dif" in grp.columns else float("nan")
events.append({
"date": str(day),
"event_type": event_type,
"utc_time": utc_ts.strftime("%H:%M:%S"),
"sza_deg": round(sza_val, 4),
"solar_depression_deg": dep,
"irr_wm2": round(irr_val, 2),
"ghi_wm2": round(ghi_at, 2) if not math.isnan(ghi_at) else None,
"dif_wm2": round(dif_at, 2) if not math.isnan(dif_at) else None,
"threshold_wm2": threshold,
"lat": lat,
"lon": lon,
"station_code": station_code,
"station_name": station_name,
"source": source,
})
return events
# ── 5-site dataset ────────────────────────────────────────────────────────────
def process_five_site() -> list[dict]:
print("\n=== BSRN 5-site parquet (Zenodo 10593079) ===")
ok = download_file(FIVE_SITE_URL, FIVE_SITE_PATH, "bsrn_valid_five_sites.parquet")
if not ok:
return []
print(" Loading ...", end=" ", flush=True)
df_raw = pd.read_parquet(FIVE_SITE_PATH)
df_raw = df_raw.reset_index() # brings times_utc and site into columns
print(f"OK — {len(df_raw):,} rows")
print(f" Columns: {list(df_raw.columns)}")
all_events = []
for site_val, grp in df_raw.groupby("site"):
# site_val is like 'asp/bsrn'
code = str(site_val).split("/")[0].lower()
meta = FIVE_SITE_MAP.get(code, {"name": code, "region": "unknown"})
# Get lat/lon from the data itself
lat = float(grp["latitude"].iloc[0])
lon = float(grp["longitude"].iloc[0])
# Rename columns to standard
sub = grp[["times_utc", "sza", "ghi", "dif"]].copy()
print(f" {meta['name']} ({code}) lat={lat:.3f} lon={lon:.3f}: "
f"{len(sub):,} rows ...", end=" ", flush=True)
evts = extract_events_from_df(sub, lat, lon, code, meta["name"], "bsrn_5site")
print(f"{len(evts)} events")
all_events.extend(evts)
return all_events
# ── CAELUS dataset ─────────────────────────────────────────────────────────────
def list_caelus_files() -> dict[str, list[int]]:
"""Return dict of station_code -> sorted list of available years."""
print(" Querying Zenodo API ...", end=" ", flush=True)
try:
r = requests.get("https://zenodo.org/api/records/7897639", timeout=30)
r.raise_for_status()
data = r.json()
available: dict[str, list[int]] = {}
for f in data.get("files", []):
fname = f.get("key", "")
if fname.endswith(".zip") and "_bsrn_" in fname:
parts = fname.replace(".zip", "").split("_bsrn_")
if len(parts) == 2:
code = parts[0]
try:
year = int(parts[1])
available.setdefault(code, []).append(year)
except ValueError:
pass
for k in available:
available[k] = sorted(available[k])
print(f"{len(available)} stations found")
return available
except Exception as e:
print(f"FAILED: {e}")
return {}
def process_caelus_zip(zip_path: Path, station_code: str, meta: dict, year: int) -> list[dict]:
events = []
try:
with zipfile.ZipFile(zip_path, "r") as zf:
csv_files = [n for n in zf.namelist() if n.endswith(".csv")]
if not csv_files:
print(f" WARNING: no CSV in {zip_path.name}")
return []
with zf.open(csv_files[0]) as f:
df = pd.read_csv(f)
# Validate expected columns
required = {"times_utc", "sza", "ghi"}
missing = required - set(df.columns)
if missing:
print(f" WARNING: missing columns {missing} in {zip_path.name}")
return []
evts = extract_events_from_df(
df, meta["lat"], meta["lon"],
station_code, meta["name"], f"caelus_{year}",
)
return evts
except zipfile.BadZipFile:
print(f" ERROR: corrupt zip {zip_path.name}")
return []
except Exception as e:
print(f" ERROR {zip_path.name}: {e}")
return []
def process_caelus(priority_codes: list[str] | None = None, max_years: int = 5) -> list[dict]:
print("\n=== CAELUS dataset (Zenodo 7897639) ===")
available = list_caelus_files()
if not available:
return []
if priority_codes is None:
# All known stations, sorted by relevance
priority_codes = list(CAELUS_STATION_META.keys())
all_events = []
for code in priority_codes:
if code not in available:
print(f" SKIP {code}: not in CAELUS")
continue
meta = CAELUS_STATION_META.get(code)
if meta is None:
print(f" SKIP {code}: no metadata")
continue
years = available[code][:max_years]
print(f"\n {meta['name']} ({code}) "
f"lat={meta['lat']} lon={meta['lon']} region={meta['region']}")
print(f" Years: {years}")
station_events = []
for year in years:
fname = f"{code}_bsrn_{year}.zip"
url = CAELUS_BASE_URL.format(filename=fname)
dest = RAW_BSRN / fname
ok = download_file(url, dest, fname)
if not ok:
continue
print(f" {fname} ...", end=" ", flush=True)
evts = process_caelus_zip(dest, code, meta, year)
print(f"{len(evts)} events ({year})")
station_events.extend(evts)
print(f"{len(station_events)} events total for {code}")
all_events.extend(station_events)
return all_events
# ── SURFRAD ───────────────────────────────────────────────────────────────────
SURFRAD_STATIONS = {
"dra": {"name": "Desert Rock", "lat": 36.626, "lon": -116.018, "dir": "DRA"},
"tbl": {"name": "TableMountain","lat": 40.125, "lon": -105.237, "dir": "TBL"},
}
SURFRAD_BASE = "https://gml.noaa.gov/aftp/data/radiation/surfrad"
def parse_surfrad_day(content: bytes) -> pd.DataFrame:
"""Parse a SURFRAD daily .dat file to a dataframe with [times_utc, sza, ghi, dif]."""
lines = content.decode("utf-8", errors="replace").splitlines()
records = []
for line in lines[2:]:
parts = line.split()
if len(parts) < 10:
continue
try:
yr = int(parts[0])
jday = int(parts[1])
hr = int(parts[2])
mn = int(parts[3])
zen = float(parts[5]) # solar zenith angle (degrees)
ghi = float(parts[6]) # downwelling shortwave (W/m²)
dif = float(parts[9]) # diffuse shortwave (W/m²)
dt = datetime(yr, 1, 1, hr, mn, tzinfo=timezone.utc) + timedelta(days=jday - 1)
# -9999 = missing
records.append({
"times_utc": dt,
"sza": None if zen < -900 else zen,
"ghi": None if ghi < -900 else max(0.0, ghi),
"dif": None if dif < -900 else max(0.0, dif),
})
except (ValueError, IndexError):
continue
return pd.DataFrame(records)
def process_surfrad(years: list[int] | None = None) -> list[dict]:
print("\n=== SURFRAD (NOAA GML) ===")
if years is None:
years = [2018, 2019, 2020]
all_events = []
for code, meta in SURFRAD_STATIONS.items():
print(f"\n {meta['name']} ({code}) lat={meta['lat']} lon={meta['lon']}")
dest_root = RAW_BSRN / "surfrad" / meta["dir"]
dest_root.mkdir(parents=True, exist_ok=True)
for year in years:
year_url = f"{SURFRAD_BASE}/{meta['dir']}/{year}/"
try:
r = requests.get(year_url, timeout=30)
if r.status_code != 200:
print(f" {year}: HTTP {r.status_code} — skipping")
continue
# Parse file listing
files = sorted(set(re.findall(rf'{code}\d{{5}}\.dat', r.text)))
print(f" {year}: {len(files)} daily files", end=" ")
except Exception as e:
print(f" {year}: ERROR listing: {e}")
continue
dest_dir = dest_root / str(year)
dest_dir.mkdir(parents=True, exist_ok=True)
day_dfs = []
for fname in files:
dest = dest_dir / fname
furl = f"{year_url}{fname}"
ok = download_file(furl, dest, fname)
if not ok:
continue
with open(dest, "rb") as f:
day_df = parse_surfrad_day(f.read())
if not day_df.empty:
day_dfs.append(day_df)
if not day_dfs:
print("→ 0 events")
continue
df_year = pd.concat(day_dfs, ignore_index=True)
evts = extract_events_from_df(
df_year, meta["lat"], meta["lon"],
code, meta["name"], f"surfrad_{year}",
)
print(f"{len(evts)} events")
all_events.extend(evts)
return all_events
# ── save / summarise ──────────────────────────────────────────────────────────
def save_events(events: list[dict], filename: str) -> Path:
out = RAW_SIGHTINGS / filename
if not events:
print(f" (no events for {filename})")
return out
df = pd.DataFrame(events)
df = df.sort_values(["station_code", "date", "event_type", "threshold_wm2"])
df.to_csv(out, index=False)
print(f" Saved {len(df):,} rows → {out}")
return out
def summarise(events: list[dict], label: str):
print(f"\n{'='*65}")
print(f"SUMMARY: {label}")
if not events:
print(" (no events)")
print("=" * 65)
return
df = pd.DataFrame(events)
# Focus summary on primary threshold
prim = df[df["threshold_wm2"] == PRIMARY_THRESHOLD]
print(f" All threshold events: {len(df):,}")
print(f" At {PRIMARY_THRESHOLD} W/m² threshold: {len(prim):,}")
print(f" Stations: {df['station_code'].nunique()}{sorted(df['station_code'].unique())}")
if not prim.empty:
print(f" Date range: {prim['date'].min()} to {prim['date'].max()}")
for evt_type in ["dawn", "dusk"]:
sub = prim[prim["event_type"] == evt_type]
if sub.empty:
continue
d = sub["solar_depression_deg"]
print(f" {evt_type.upper()} events: {len(sub):,} | "
f"mean dep={d.mean():.2f}° std={d.std():.2f}° "
f"p10={d.quantile(0.1):.2f}° p90={d.quantile(0.9):.2f}°")
print("=" * 65)
# ── main ──────────────────────────────────────────────────────────────────────
def main():
print("BSRN Twilight Extractor v2")
print(f"Primary threshold: {PRIMARY_THRESHOLD} W/m²")
print(f"All thresholds: {THRESHOLDS}")
print()
all_events = []
# 1. BSRN 5-site
five_events = process_five_site()
save_events(five_events, "bsrn_5site_twilight.csv")
summarise(five_events, "BSRN 5-site")
all_events.extend(five_events)
# 2. CAELUS — priority stations in 20-45°N
priority = ["sbo", "tam", "xia", "iza", "ish", "dra", "e13", "tat", "fua",
"gcr", "bil", "abs", "lyu", "cnr", "flo"]
caelus_events = process_caelus(priority_codes=priority, max_years=5)
save_events(caelus_events, "bsrn_caelus_twilight.csv")
summarise(caelus_events, "CAELUS")
all_events.extend(caelus_events)
# 3. SURFRAD — Desert Rock (arid, 36.6°N) + Table Mountain
surfrad_events = process_surfrad(years=[2018, 2019, 2020])
save_events(surfrad_events, "surfrad_twilight.csv")
summarise(surfrad_events, "SURFRAD")
all_events.extend(surfrad_events)
# Combined output
save_events(all_events, "bsrn_all_twilight.csv")
summarise(all_events, "ALL SOURCES")
print("\nDone.")
if __name__ == "__main__":
main()

View file

@ -0,0 +1,93 @@
import requests
import os
import re
from pathlib import Path
import time
PAPERS_DIR = Path("research/downloaded_papers")
PAPERS_DIR.mkdir(parents=True, exist_ok=True)
# Queries targeting Islamic twilight observations
QUERIES = [
"fajr twilight observation angle",
"subuh sky brightness meter",
"shafaq twilight measurement isha",
"depression angle fajr isha",
"true dawn observation measurements"
]
def fetch_openalex_papers(query):
url = f"https://api.openalex.org/works?search={query}&per-page=50&filter=has_pdf_url:true"
print(f"Querying OpenAlex: {query}")
try:
res = requests.get(url, timeout=15)
if res.status_code != 200:
return []
data = res.json()
return data.get("results", [])
except Exception as e:
print(f"Error fetching {query}: {e}")
return []
def sanitize_filename(title):
return re.sub(r"[^a-zA-Z0-9]+", "_", title).strip("_")
def main():
seen_titles = set()
total_downloaded = 0
# Check existing files to avoid duplicates
existing_files = list(PAPERS_DIR.glob("*.pdf"))
for file in existing_files:
name_no_ext = file.stem.lower()
seen_titles.add(name_no_ext)
for query in QUERIES:
works = fetch_openalex_papers(query)
for work in works:
title = work.get("title")
if not title:
continue
clean_title = sanitize_filename(title).lower()
# Simple duplicate check
if any(clean_title[:30] in seen[:30] for seen in seen_titles):
print(f"Skipping duplicate/known: {title}")
continue
pdf_url = work.get("open_access", {}).get("oa_url")
if not pdf_url or not pdf_url.endswith(".pdf"):
continue
year = work.get("publication_year", "0000")
filename = f"{year}_{clean_title[:60]}.pdf"
filepath = PAPERS_DIR / filename
if filepath.exists():
print(f"Already downloaded: {filename}")
continue
print(f"Downloading: {title} ({year})")
print(f" URL: {pdf_url}")
try:
# Add headers to mimic browser
headers = {'User-Agent': 'Mozilla/5.0'}
pdf_res = requests.get(pdf_url, headers=headers, timeout=20)
if pdf_res.status_code == 200:
with open(filepath, "wb") as f:
f.write(pdf_res.content)
print(f" -> Saved as {filename}")
seen_titles.add(clean_title)
total_downloaded += 1
else:
print(f" -> Failed with status {pdf_res.status_code}")
except Exception as e:
print(f" -> Download failed: {e}")
time.sleep(1) # Be nice to servers
print(f"\nTotal new papers downloaded: {total_downloaded}")
if __name__ == "__main__":
main()

View file

@ -0,0 +1,226 @@
"""
Download all GaN-MN monthly and annual CSV files, then process each one
with the GaN-MN processor and save results to data/raw/raw_sightings/.
Run from repo root:
/Volumes/X9/Sites/acamarata/pray-calc-ml/.venv/bin/python \
src/collect/download_gan_mn.py
"""
from __future__ import annotations
import csv
import logging
import sys
import time
from pathlib import Path
import requests
# Add repo root to path so we can import the processor
REPO_ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(REPO_ROOT))
from src.collect.gan_mn_processor import process_gan_mn_csv # noqa: E402
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
datefmt="%H:%M:%S",
)
log = logging.getLogger(__name__)
RAW_DIR = REPO_ROOT / "data" / "raw" / "gan_mn"
SIGHTINGS_DIR = REPO_ROOT / "data" / "raw" / "raw_sightings"
RAW_DIR.mkdir(parents=True, exist_ok=True)
SIGHTINGS_DIR.mkdir(parents=True, exist_ok=True)
# All monthly files, 2022-2025 (skip already-downloaded ones)
# Format: (label, bit.ly URL, local filename)
MONTHLY_FILES = [
# 2025 (skip Jun 2025 and Jan 2025 — already have them)
("Aug_2025", "https://bit.ly/4a9aXIP", "GaN-MN_August_2025.csv"),
("Sep_2025", "https://bit.ly/4sm4pxY", "GaN-MN_September_2025.csv"),
("Jul_2025", "https://bit.ly/4jlEQtz", "GaN-MN_July_2025.csv"),
("May_2025", "https://bit.ly/47w5sCA", "GaN-MN_May_2025.csv"),
("Apr_2025", "https://bit.ly/4nxs6Bm", "GaN-MN_April_2025.csv"),
("Mar_2025", "https://bit.ly/47mPHzx", "GaN-MN_March_2025.csv"),
("Feb_2025", "https://bit.ly/4lbwPXg", "GaN-MN_February_2025.csv"),
# 2024 (skip Aug 2024 — already have it)
("Dec_2024", "https://bit.ly/4mHSBno", "GaN-MN_December_2024.csv"),
("Nov_2024", "https://bit.ly/4lVwidL", "GaN-MN_November_2024.csv"),
("Oct_2024", "https://bit.ly/449h3ay", "GaN-MN_October_2024.csv"),
("Sep_2024", "https://bit.ly/4hUE458", "GaN-MN_September_2024.csv"),
("Jul_2024", "https://bit.ly/4fAjvJi", "GaN-MN_July_2024.csv"),
("Jun_2024", "https://bit.ly/3ZvT1DR", "GaN-MN_June_2024.csv"),
("May_2024", "https://bit.ly/4f3veR8", "GaN-MN_May_2024.csv"),
("Apr_2024", "https://bit.ly/4etfYg0", "GaN-MN_April_2024.csv"),
("Mar_2024", "https://bit.ly/3Z69pLy", "GaN-MN_March_2024.csv"),
("Feb_2024", "https://bit.ly/46wtyfJ", "GaN-MN_February_2024.csv"),
("Jan_2024", "https://bit.ly/4cMztz7", "GaN-MN_January_2024.csv"),
# 2023
("Dec_2023", "https://bit.ly/45dEDl3", "GaN-MN_December_2023.csv"),
("Nov_2023", "https://bit.ly/4ajnV4v", "GaN-MN_November_2023.csv"),
("Oct_2023", "https://bit.ly/3PKzryD", "GaN-MN_October_2023.csv"),
("Sep_2023", "https://bit.ly/3ww3T8K", "GaN-MN_September_2023.csv"),
("Aug_2023", "https://bit.ly/3So1UKV", "GaN-MN_August_2023.csv"),
("Jul_2023", "https://bit.ly/3S3mG3u", "GaN-MN_July_2023.csv"),
("Jun_2023", "https://bit.ly/46Es4OL", "GaN-MN_June_2023.csv"),
("May_2023", "https://bit.ly/46WbF9m", "GaN-MN_May_2023.csv"),
("Apr_2023", "https://bit.ly/3rxWWSy", "GaN-MN_April_2023.csv"),
("Mar_2023", "https://bit.ly/3OW45nv", "GaN-MN_March_2023.csv"),
("Feb_2023", "https://bit.ly/3rSY2br", "GaN-MN_February_2023.csv"),
("Jan_2023", "https://bit.ly/3JGJpy1", "GaN-MN_January_2023.csv"),
# 2022
("Dec_2022", "https://bit.ly/43CelXB", "GaN-MN_December_2022.csv"),
("Nov_2022", "https://bit.ly/429RdyR", "GaN-MN_November_2022.csv"),
("Oct_2022", "https://bit.ly/3m1onBm", "GaN-MN_October_2022.csv"),
("Sep_2022", "http://bit.ly/3SHQJN5", "GaN-MN_September_2022.csv"),
("Aug_2022", "http://bit.ly/40hFpdP", "GaN-MN_August_2022.csv"),
("Jul_2022", "https://bit.ly/3Z487OD", "GaN-MN_July_2022.csv"),
("Jun_2022", "https://bit.ly/3H5CsWS", "GaN-MN_June_2022.csv"),
("May_2022", "https://bit.ly/3fnFCJT", "GaN-MN_May_2022.csv"),
("Apr_2022", "https://bit.ly/3E8flJr", "GaN-MN_April_2022.csv"),
("Mar_2022", "https://bit.ly/3B76VAq", "GaN-MN_March_2022.csv"),
("Feb_2022", "http://bit.ly/3BCkXe1", "GaN-MN_February_2022.csv"),
("Jan_2022", "http://bit.ly/3IgfkU4", "GaN-MN_January_2022.csv"),
]
# Annual file for 2021 (one representative year)
ANNUAL_FILES = [
("2021", "https://bit.ly/44vtt9T", "GaN-MN_Annual_2021.csv"),
]
FIELDNAMES = [
"prayer", "date_local", "time_local", "utc_offset",
"lat", "lng", "elevation_m", "source", "notes",
]
def resolve_and_download(label: str, url: str, dest: Path) -> bool:
"""
Follow bit.ly redirect and download the file to dest.
Returns True on success.
"""
if dest.exists():
log.info("Already have %s — skipping download", dest.name)
return True
log.info("Downloading %s from %s ...", label, url)
try:
# bit.ly does a redirect; requests follows by default
headers = {
"User-Agent": (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/124.0.0.0 Safari/537.36"
)
}
with requests.get(url, headers=headers, stream=True, timeout=120) as resp:
resp.raise_for_status()
total = int(resp.headers.get("content-length", 0))
downloaded = 0
with open(dest, "wb") as fh:
for chunk in resp.iter_content(chunk_size=1 << 20): # 1 MB chunks
if chunk:
fh.write(chunk)
downloaded += len(chunk)
mb = downloaded / (1 << 20)
log.info("Downloaded %s: %.1f MB", dest.name, mb)
return True
except Exception as exc:
log.error("Failed to download %s: %s", label, exc)
if dest.exists():
dest.unlink()
return False
def process_and_save(label: str, csv_path: Path) -> int:
"""
Run the GaN-MN processor on csv_path and write results to
data/raw/raw_sightings/gan_mn_{label_lower}.csv.
Returns count of records written.
"""
slug = label.lower().replace(" ", "_")
out_path = SIGHTINGS_DIR / f"gan_mn_{slug}.csv"
if out_path.exists():
# Count existing rows
with open(out_path) as fh:
existing = sum(1 for _ in fh) - 1 # subtract header
log.info("Already processed %s (%d rows) — skipping", out_path.name, existing)
return existing
log.info("Processing %s ...", csv_path.name)
try:
records = process_gan_mn_csv(csv_path)
except Exception as exc:
log.error("Processing failed for %s: %s", csv_path.name, exc)
return 0
if not records:
log.warning("No records extracted from %s", csv_path.name)
return 0
with open(out_path, "w", newline="") as fh:
writer = csv.DictWriter(fh, fieldnames=FIELDNAMES)
writer.writeheader()
writer.writerows(records)
log.info("Wrote %d records to %s", len(records), out_path.name)
return len(records)
def main() -> None:
all_files = MONTHLY_FILES + ANNUAL_FILES
total_downloaded = 0
total_records = 0
skipped_download = 0
failed_download = []
results = []
for label, url, filename in all_files:
dest = RAW_DIR / filename
ok = resolve_and_download(label, url, dest)
if not ok:
failed_download.append(label)
continue
if dest.stat().st_size < 1000:
log.error("%s appears too small (%d bytes), skipping processing", filename, dest.stat().st_size)
failed_download.append(label)
continue
total_downloaded += 1
count = process_and_save(label, dest)
total_records += count
results.append((label, count))
# Brief pause between files to avoid hammering the network
time.sleep(0.5)
# Also count already-existing sightings from the 3 pre-downloaded months
pre_existing = ["august_2024", "january_2025", "june_2025"]
pre_total = 0
for slug in pre_existing:
p = SIGHTINGS_DIR / f"gan_mn_{slug}.csv"
if p.exists():
with open(p) as fh:
n = sum(1 for _ in fh) - 1
pre_total += n
log.info("Pre-existing %s: %d rows", p.name, n)
log.info("=" * 60)
log.info("Download summary:")
log.info(" Files newly processed: %d", total_downloaded)
log.info(" Records from new files: %d", total_records)
log.info(" Records from pre-existing 3 months: %d", pre_total)
log.info(" Grand total twilight events: %d", total_records + pre_total)
if failed_download:
log.warning(" Failed downloads: %s", ", ".join(failed_download))
log.info("Per-file breakdown:")
for label, count in results:
log.info(" %-15s %d", label, count)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,169 @@
"""
Processor for the Madrid Zenodo SQM dataset (Record 4633001).
This script reads data/raw/massive_sqm/SQM_evol.csv, representing 10 years of
continuous minute-by-minute Sky Quality Meter readings in Madrid, Spain.
We apply the BRIN inflection-point detection algorithm:
- Fajr: Moment of steepest SQM decline in the pre-dawn window.
- Isha: Moment SQM reaches within 0.5 mag of the dark-night baseline.
Output: data/raw/raw_sightings/madrid_sqm_10yr.csv
"""
import math
from pathlib import Path
from datetime import datetime, timezone, timedelta
import pandas as pd
import ephem
# Madrid UCM Observatory from paper metadata
LAT = 40.45
LNG = -3.73
ELEVATION_M = 640
UTC_OFFSET = 1.0 # Winter UTC+1, though data is in UTC so we just use this for local labeling
RAW_CSV = Path("data/raw/massive_sqm/SQM_evol.csv")
OUT_FILE = Path("data/raw/raw_sightings/madrid_sqm_10yr.csv")
MIN_DARK_SKY_MPSAS = 18.0
MAX_MOON_ALT = 5.0
def get_solar_altitude(utc_dt: datetime) -> float:
obs = ephem.Observer()
obs.lat = str(LAT)
obs.lon = str(LNG)
obs.elevation = ELEVATION_M
obs.pressure = 1013.25
obs.temp = 15.0
if utc_dt.tzinfo is not None:
utc_dt = utc_dt.replace(tzinfo=None)
obs.date = ephem.Date(utc_dt)
sun = ephem.Sun(obs)
return math.degrees(float(sun.alt))
def process_madrid_data():
if not RAW_CSV.exists():
print(f"File not found: {RAW_CSV}")
return
print("Loading Madrid SQM dataset. This may take a minute...")
# Format: Time(UTC);Magnitude
df = pd.read_csv(RAW_CSV, sep=';', usecols=['Time(UTC)', 'Magnitude'])
df['utc_dt'] = pd.to_datetime(df['Time(UTC)']).dt.tz_localize(timezone.utc)
df = df.rename(columns={'Magnitude': 'mpsas'})
# Filter bad data
df = df[df['mpsas'] > 0]
df = df.sort_values('utc_dt').reset_index(drop=True)
print(f"Loaded {len(df)} rows. Processing nights...")
# Identify unique local calendar dates in the dataset
df['local_date'] = (df['utc_dt'] + timedelta(hours=UTC_OFFSET)).dt.date
unique_dates = df['local_date'].unique()
records = []
obs = ephem.Observer()
obs.lat = str(LAT)
obs.lon = str(LNG)
obs.elevation = ELEVATION_M
obs.pressure = 1013.25
obs.temp = 15.0
print(f"Found {len(unique_dates)} nights to process.")
for local_date in unique_dates:
# Midnight UTC for the given local date
noon_utc = datetime(local_date.year, local_date.month, local_date.day, 12, 0, tzinfo=timezone.utc) - timedelta(hours=UTC_OFFSET)
obs.date = ephem.Date(noon_utc.replace(tzinfo=None))
try:
sunset_utc = obs.next_setting(ephem.Sun()).datetime().replace(tzinfo=timezone.utc)
sunrise_utc = obs.next_rising(ephem.Sun()).datetime().replace(tzinfo=timezone.utc)
except ephem.AlwaysUpError:
continue
except ephem.NeverUpError:
continue
# FAJR
window_start = sunrise_utc - timedelta(hours=5)
window_end = sunrise_utc - timedelta(minutes=10)
predawn = df[(df['utc_dt'] >= window_start) & (df['utc_dt'] <= window_end)].copy()
if len(predawn) >= 30:
predawn['sun_alt'] = [get_solar_altitude(dt) for dt in predawn['utc_dt']]
predawn = predawn[predawn['sun_alt'] < -5.0]
early = predawn[predawn['sun_alt'] < -18.0]
if early.empty:
early = predawn[predawn['sun_alt'] < -16.0]
if not early.empty and early['mpsas'].max() >= MIN_DARK_SKY_MPSAS:
predawn['mpsas_smooth'] = predawn['mpsas'].rolling(window=5, center=True, min_periods=3).mean()
predawn['dmpsas'] = predawn['mpsas_smooth'].diff()
active = predawn[(predawn['sun_alt'] >= -25.0) & (predawn['sun_alt'] <= -5.0)]
if len(active) >= 10:
steepest_idx = active['dmpsas'].idxmin()
if not pd.isna(steepest_idx):
dawn_row = predawn.loc[steepest_idx]
depression_angle = -dawn_row['sun_alt']
if 10.0 <= depression_angle <= 22.0:
local_time = dawn_row['utc_dt'] + timedelta(hours=UTC_OFFSET)
records.append({
"prayer": "fajr",
"date_local": local_time.strftime("%Y-%m-%d"),
"time_local": local_time.strftime("%H:%M"),
"utc_offset": UTC_OFFSET,
"lat": LAT,
"lng": LNG,
"elevation_m": ELEVATION_M,
"source": "Madrid Zenodo SQM (10-year)",
"notes": f"SQM inflection point Fajr detection; Madrid UCM Observatory; dark-sky baseline>={MIN_DARK_SKY_MPSAS}"
})
# ISHA
window_end_isha = sunset_utc + timedelta(hours=5)
evening = df[(df['utc_dt'] >= sunset_utc) & (df['utc_dt'] <= window_end_isha)].copy()
if len(evening) >= 30:
evening['sun_alt'] = [get_solar_altitude(dt) for dt in evening['utc_dt']]
evening = evening[evening['sun_alt'] < 0]
deep_night = evening[(evening['utc_dt'] >= sunset_utc + timedelta(hours=2)) &
(evening['utc_dt'] <= sunset_utc + timedelta(hours=6)) &
(evening['sun_alt'] < -18.0)]
if len(deep_night) >= 10:
dark_baseline = deep_night['mpsas'].median()
if dark_baseline >= MIN_DARK_SKY_MPSAS:
evening['mpsas_smooth'] = evening['mpsas'].rolling(window=5, center=True, min_periods=3).mean()
threshold = dark_baseline - 0.5
reached = evening[(evening['mpsas_smooth'] >= threshold) & (evening['sun_alt'] < -12.0)]
if not reached.empty:
isha_row = reached.iloc[0]
depression_angle = -isha_row['sun_alt']
if 12.0 <= depression_angle <= 22.0:
local_time = isha_row['utc_dt'] + timedelta(hours=UTC_OFFSET)
records.append({
"prayer": "isha",
"date_local": local_time.strftime("%Y-%m-%d"),
"time_local": local_time.strftime("%H:%M"),
"utc_offset": UTC_OFFSET,
"lat": LAT,
"lng": LNG,
"elevation_m": ELEVATION_M,
"source": "Madrid Zenodo SQM (10-year)",
"notes": f"SQM threshold Isha detection (Shafaq Abyad); dark-sky baseline={dark_baseline:.2f}"
})
out_df = pd.DataFrame(records)
OUT_FILE.parent.mkdir(parents=True, exist_ok=True)
out_df.to_csv(OUT_FILE, index=False)
print(f"Successfully processed {len(records)} Fajr/Isha observations to {OUT_FILE}.")
if __name__ == "__main__":
process_madrid_data()

282
src/surfrad_process.py Normal file
View file

@ -0,0 +1,282 @@
"""
SURFRAD Downloader and Twilight Extractor standalone.
Stations:
- dra: Desert Rock, Nevada (36.626°N, 116.018°W) arid, excellent visibility
- tbl: Table Mountain, Colorado (40.125°N, 105.237°W) high altitude (1689m)
- gwn: Goodwin Creek, Mississippi (34.255°N, 89.873°W) humid subtropical
File naming: {code}{YY}{DDD}.dat
YY = 2-digit year
DDD = day of year (001-366)
Column layout (SURFRAD standard format, v1):
0: year
1: jday (day of year)
2: month
3: day
4: hour (UTC)
5: min
6: dt (time difference in minutes from standard time)
7: zen (solar zenith angle, degrees)
8: dw_solar (downwelling shortwave, W/) GHI
9: dw_solar_sd
10: direct (direct normal irradiance, W/)
11: direct_sd
12: diffuse (diffuse horizontal, W/) DIF
13: diffuse_sd
14+: other measurements
Missing value: -9999.9
"""
import re
import math
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timezone, timedelta, date
from pathlib import Path
BASE = Path("/Volumes/X9/Sites/acamarata/pray-calc-ml")
RAW_BSRN = BASE / "data/raw/bsrn"
RAW_SIGHTINGS = BASE / "data/raw/raw_sightings"
THRESHOLDS = [0.5, 1.0, 2.0, 5.0, 10.0]
SURFRAD_BASE = "https://gml.noaa.gov/aftp/data/radiation/surfrad"
STATIONS = {
"dra": {"name": "Desert Rock", "lat": 36.626, "lon": -116.018, "dir": "dra"},
"tbl": {"name": "TableMountain", "lat": 40.125, "lon": -105.237, "dir": "tbl"},
"gwn": {"name": "GoodwinCreek", "lat": 34.2547, "lon": -89.8729, "dir": "gwn"},
}
def download_file(url: str, dest: Path) -> bool:
if dest.exists() and dest.stat().st_size > 500:
return True
try:
r = requests.get(url, timeout=60)
r.raise_for_status()
dest.write_bytes(r.content)
return True
except Exception:
return False
def parse_surfrad_dat(content: bytes) -> pd.DataFrame:
"""
Parse SURFRAD .dat file.
Returns df with [times_utc, sza, ghi, dif].
"""
lines = content.decode("utf-8", errors="replace").splitlines()
# First 2 lines are headers; data starts line 2
records = []
for line in lines[2:]:
parts = line.split()
if len(parts) < 13:
continue
try:
yr = int(parts[0])
jday = int(parts[1])
hr = int(parts[4])
mn = int(parts[5])
zen = float(parts[7])
ghi = float(parts[8])
dif = float(parts[12])
dt = datetime(yr, 1, 1, hr, mn, tzinfo=timezone.utc) + timedelta(days=jday - 1)
records.append({
"times_utc": dt,
"sza": None if zen < -900 else zen,
"ghi": None if ghi < -900 else max(0.0, ghi),
"dif": None if dif < -900 else max(0.0, dif),
})
except (ValueError, IndexError):
continue
return pd.DataFrame(records)
def extract_events(df: pd.DataFrame, lat: float, lon: float,
station_code: str, station_name: str,
source: str) -> list[dict]:
"""Same logic as bsrn_process.py extract_events_from_df."""
events = []
df = df.copy()
df["times_utc"] = pd.to_datetime(df["times_utc"], utc=True, errors="coerce")
df = df.dropna(subset=["times_utc"]).sort_values("times_utc").reset_index(drop=True)
# Use diffuse as primary signal, fall back to GHI
df["irr"] = pd.to_numeric(df.get("dif", pd.Series()), errors="coerce")
if "ghi" in df.columns:
df["irr"] = df["irr"].combine_first(pd.to_numeric(df["ghi"], errors="coerce"))
df["sza"] = pd.to_numeric(df["sza"], errors="coerce")
df["date"] = df["times_utc"].dt.date
for day, grp in df.groupby("date"):
grp = grp.sort_values("times_utc").reset_index(drop=True)
irr = grp["irr"].values
sza = grp["sza"].values
ts = grp["times_utc"].values
n = len(irr)
if n < 10:
continue
for threshold in THRESHOLDS:
dawn_idx = None
for i in range(2, n):
if (not np.isnan(irr[i]) and irr[i] >= threshold
and not np.isnan(sza[i]) and sza[i] > 90.0):
prev_ok = all(
(np.isnan(v) or v < threshold)
for v in irr[max(0, i-3):i]
)
if prev_ok:
dawn_idx = i
break
dusk_idx = None
for i in range(n - 3, -1, -1):
if (not np.isnan(irr[i]) and irr[i] >= threshold
and not np.isnan(sza[i]) and sza[i] > 90.0):
after_ok = all(
(np.isnan(v) or v < threshold)
for v in irr[i+1:min(n, i+4)]
)
if after_ok:
dusk_idx = i
break
for idx, event_type in [(dawn_idx, "dawn"), (dusk_idx, "dusk")]:
if idx is None:
continue
sza_val = float(sza[idx])
if sza_val <= 90.0:
continue
dep = round(sza_val - 90.0, 4)
if dep < 0.3 or dep > 20.0:
continue
utc_ts = pd.Timestamp(ts[idx])
irr_val = float(irr[idx])
ghi_at = float(grp["ghi"].iloc[idx]) if "ghi" in grp.columns else float("nan")
dif_at = float(grp["dif"].iloc[idx]) if "dif" in grp.columns else float("nan")
events.append({
"date": str(day),
"event_type": event_type,
"utc_time": utc_ts.strftime("%H:%M:%S"),
"sza_deg": round(sza_val, 4),
"solar_depression_deg": dep,
"irr_wm2": round(irr_val, 2),
"ghi_wm2": round(ghi_at, 2) if not math.isnan(ghi_at) else None,
"dif_wm2": round(dif_at, 2) if not math.isnan(dif_at) else None,
"threshold_wm2": threshold,
"lat": lat,
"lon": lon,
"station_code": station_code,
"station_name": station_name,
"source": source,
})
return events
def process_station(code: str, meta: dict, years: list[int]) -> list[dict]:
print(f"\n {meta['name']} ({code}) lat={meta['lat']} lon={meta['lon']}")
dir_code = meta["dir"]
dest_root = RAW_BSRN / "surfrad" / dir_code
dest_root.mkdir(parents=True, exist_ok=True)
all_events = []
for year in years:
yy = str(year)[2:] # 2-digit year
# Build list of days
import calendar
days_in_year = 366 if calendar.isleap(year) else 365
dest_dir = dest_root / str(year)
dest_dir.mkdir(parents=True, exist_ok=True)
day_dfs = []
success_count = 0
for doy in range(1, days_in_year + 1):
fname = f"{dir_code}{yy}{doy:03d}.dat"
dest = dest_dir / fname
url = f"{SURFRAD_BASE}/{dir_code}/{year}/{fname}"
ok = download_file(url, dest)
if not ok:
continue
with open(dest, "rb") as f:
day_df = parse_surfrad_dat(f.read())
if not day_df.empty:
day_dfs.append(day_df)
success_count += 1
if not day_dfs:
print(f" {year}: 0 days downloaded")
continue
df_year = pd.concat(day_dfs, ignore_index=True)
evts = extract_events(df_year, meta["lat"], meta["lon"],
code, meta["name"], f"surfrad_{year}")
print(f" {year}: {success_count} days → {len(evts)} twilight events "
f"({len(df_year):,} 1-min records)")
all_events.extend(evts)
return all_events
def main():
print("SURFRAD Twilight Extractor")
years = [2018, 2019, 2020]
print(f"Years: {years}")
print(f"Stations: {list(STATIONS.keys())}")
all_events = []
for code, meta in STATIONS.items():
evts = process_station(code, meta, years)
all_events.extend(evts)
print(f"{len(evts)} total events for {code}")
# Save
out = RAW_SIGHTINGS / "surfrad_twilight.csv"
if all_events:
df = pd.DataFrame(all_events)
df = df.sort_values(["station_code", "date", "event_type", "threshold_wm2"])
df.to_csv(out, index=False)
print(f"\nSaved {len(df):,} rows → {out}")
prim = df[df["threshold_wm2"] == 1.0]
print(f"\nSUMMARY (1.0 W/m² threshold):")
print(f" Total: {len(prim):,} events")
for evt in ["dawn", "dusk"]:
sub = prim[prim["event_type"] == evt]
if sub.empty:
continue
d = sub["solar_depression_deg"]
print(f" {evt.upper()}: {len(sub):,} | "
f"mean={d.mean():.2f}° p10={d.quantile(0.1):.2f}° "
f"p90={d.quantile(0.9):.2f}°")
else:
print("No events extracted.")
# Merge into bsrn_all_twilight.csv
all_file = RAW_SIGHTINGS / "bsrn_all_twilight.csv"
if all_file.exists() and all_events:
df_existing = pd.read_csv(all_file)
# Remove old surfrad rows if any
df_existing = df_existing[~df_existing["source"].str.startswith("surfrad", na=False)]
df_new = pd.concat([df_existing, pd.DataFrame(all_events)], ignore_index=True)
df_new = df_new.sort_values(["station_code", "date", "event_type", "threshold_wm2"])
df_new.to_csv(all_file, index=False)
print(f"\nUpdated bsrn_all_twilight.csv → {len(df_new):,} total rows")
print("\nDone.")
if __name__ == "__main__":
main()

204
src/surfrad_process2.py Normal file
View file

@ -0,0 +1,204 @@
"""
SURFRAD processor reads already-downloaded files in DRA/ and TBL/ dirs.
Also downloads remaining missing days.
"""
import math
import requests
import pandas as pd
import numpy as np
import calendar
from datetime import datetime, timezone, timedelta
from pathlib import Path
BASE = Path("/Volumes/X9/Sites/acamarata/pray-calc-ml")
RAW_BSRN = BASE / "data/raw/bsrn"
RAW_SIGHTINGS = BASE / "data/raw/raw_sightings"
THRESHOLDS = [0.5, 1.0, 2.0, 5.0, 10.0]
SURFRAD_BASE = "https://gml.noaa.gov/aftp/data/radiation/surfrad"
STATIONS = {
"dra": {"name": "Desert Rock", "lat": 36.626, "lon": -116.018, "dir_code": "DRA", "file_prefix": "dra"},
"tbl": {"name": "TableMountain", "lat": 40.125, "lon": -105.237, "dir_code": "TBL", "file_prefix": "tbl"},
}
YEARS = [2018, 2019, 2020]
def download_file(url: str, dest: Path) -> bool:
if dest.exists() and dest.stat().st_size > 200:
return True
try:
r = requests.get(url, timeout=30)
if r.status_code != 200:
return False
dest.write_bytes(r.content)
return True
except Exception:
return False
def parse_surfrad_dat(fpath: Path) -> pd.DataFrame:
lines = fpath.read_bytes().decode("utf-8", errors="replace").splitlines()
records = []
for line in lines[2:]:
parts = line.split()
if len(parts) < 13:
continue
try:
yr = int(parts[0])
jday = int(parts[1])
hr = int(parts[4])
mn = int(parts[5])
zen = float(parts[7])
ghi = float(parts[8])
dif = float(parts[12])
dt = datetime(yr, 1, 1, hr, mn, tzinfo=timezone.utc) + timedelta(days=jday - 1)
records.append({
"times_utc": dt,
"sza": None if zen < -900 else zen,
"ghi": None if ghi < -900 else max(0.0, ghi),
"dif": None if dif < -900 else max(0.0, dif),
})
except (ValueError, IndexError):
continue
return pd.DataFrame(records)
def extract_events(df, lat, lon, code, name, source):
events = []
df = df.copy()
df["times_utc"] = pd.to_datetime(df["times_utc"], utc=True, errors="coerce")
df = df.dropna(subset=["times_utc"]).sort_values("times_utc").reset_index(drop=True)
df["irr"] = pd.to_numeric(df.get("dif", pd.Series(dtype=float)), errors="coerce")
if "ghi" in df.columns:
df["irr"] = df["irr"].combine_first(pd.to_numeric(df["ghi"], errors="coerce"))
df["sza"] = pd.to_numeric(df["sza"], errors="coerce")
df["date"] = df["times_utc"].dt.date
for day, grp in df.groupby("date"):
grp = grp.sort_values("times_utc").reset_index(drop=True)
irr = grp["irr"].values
sza = grp["sza"].values
ts = grp["times_utc"].values
n = len(irr)
if n < 10:
continue
for threshold in THRESHOLDS:
dawn_idx = None
for i in range(2, n):
if (not np.isnan(irr[i]) and irr[i] >= threshold
and not np.isnan(sza[i]) and sza[i] > 90.0
and all((np.isnan(v) or v < threshold) for v in irr[max(0,i-3):i])):
dawn_idx = i
break
dusk_idx = None
for i in range(n-3, -1, -1):
if (not np.isnan(irr[i]) and irr[i] >= threshold
and not np.isnan(sza[i]) and sza[i] > 90.0
and all((np.isnan(v) or v < threshold) for v in irr[i+1:min(n,i+4)])):
dusk_idx = i
break
for idx, etype in [(dawn_idx, "dawn"), (dusk_idx, "dusk")]:
if idx is None:
continue
sza_val = float(sza[idx])
if sza_val <= 90.0:
continue
dep = round(sza_val - 90.0, 4)
if dep < 0.3 or dep > 20.0:
continue
utc_ts = pd.Timestamp(ts[idx])
ghi_at = float(grp["ghi"].iloc[idx]) if "ghi" in grp.columns else float("nan")
dif_at = float(grp["dif"].iloc[idx]) if "dif" in grp.columns else float("nan")
events.append({
"date": str(day), "event_type": etype,
"utc_time": utc_ts.strftime("%H:%M:%S"),
"sza_deg": round(sza_val, 4),
"solar_depression_deg": dep,
"irr_wm2": round(float(irr[idx]), 2),
"ghi_wm2": round(ghi_at, 2) if not math.isnan(ghi_at) else None,
"dif_wm2": round(dif_at, 2) if not math.isnan(dif_at) else None,
"threshold_wm2": threshold,
"lat": lat, "lon": lon,
"station_code": code, "station_name": name, "source": source,
})
return events
def main():
print("SURFRAD Processor v2 — using cached downloads + filling gaps")
all_events = []
for code, meta in STATIONS.items():
print(f"\n{meta['name']} ({code})")
dir_code = meta["dir_code"]
prefix = meta["file_prefix"]
dest_root = RAW_BSRN / "surfrad" / dir_code
for year in YEARS:
yy = str(year)[2:]
days_in_year = 366 if calendar.isleap(year) else 365
dest_dir = dest_root / str(year)
dest_dir.mkdir(parents=True, exist_ok=True)
# Download any missing days
missing = 0
for doy in range(1, days_in_year + 1):
fname = f"{prefix}{yy}{doy:03d}.dat"
dest = dest_dir / fname
if dest.exists() and dest.stat().st_size > 200:
continue
url = f"{SURFRAD_BASE}/{prefix}/{year}/{fname}"
if download_file(url, dest):
missing += 1
# Parse all available files
day_dfs = []
for fpath in sorted(dest_dir.glob(f"{prefix}{yy}*.dat")):
day_df = parse_surfrad_dat(fpath)
if not day_df.empty:
day_dfs.append(day_df)
if not day_dfs:
print(f" {year}: no data files found")
continue
df_year = pd.concat(day_dfs, ignore_index=True)
evts = extract_events(df_year, meta["lat"], meta["lon"],
code, meta["name"], f"surfrad_{year}")
print(f" {year}: {len(day_dfs)} days, {len(df_year):,} records → {len(evts)} events (downloaded {missing} new files)")
all_events.extend(evts)
out = RAW_SIGHTINGS / "surfrad_twilight.csv"
if all_events:
df = pd.DataFrame(all_events).sort_values(
["station_code", "date", "event_type", "threshold_wm2"])
df.to_csv(out, index=False)
print(f"\nSaved {len(df):,} rows → {out}")
prim = df[df["threshold_wm2"] == 1.0]
print(f"\nSummary at 1 W/m²: {len(prim):,} events")
for etype in ["dawn", "dusk"]:
sub = prim[prim["event_type"] == etype]
if not sub.empty:
d = sub["solar_depression_deg"]
print(f" {etype}: {len(sub):,} | mean={d.mean():.2f}° "
f"p10={d.quantile(0.1):.2f}° p90={d.quantile(0.9):.2f}°")
# Update combined file
combined = RAW_SIGHTINGS / "bsrn_all_twilight.csv"
if combined.exists():
df_all = pd.read_csv(combined)
df_all = df_all[~df_all["source"].str.startswith("surfrad", na=False)]
df_all = pd.concat([df_all, pd.DataFrame(all_events)], ignore_index=True)
df_all = df_all.sort_values(["station_code", "date", "event_type", "threshold_wm2"])
df_all.to_csv(combined, index=False)
print(f"Updated bsrn_all_twilight.csv → {len(df_all):,} total rows")
else:
print("No events.")
print("\nDone.")
if __name__ == "__main__":
main()