mirror of
https://github.com/acamarata/pray-calc-ml.git
synced 2026-06-30 19:04:26 +00:00
chore: add remaining processors and analysis scripts, gitignore experimental
Tracked: BSRN/SURFRAD processors (reference, excluded from pipeline), GaN-MN downloader, academic paper fetcher, Madrid SQM processor, ML analysis scripts (src/analyze/), umsu_medan_2024 raw sightings. Gitignored: global_extrapolator, instant_1m_injector/vectorized, massive_harvest_engine, massive_sqm_downloader, global_sqm_harvester, run_infinite_pipeline.sh, run_massive_collection.sh, search_papers.py (agent-generated experimental scripts, not part of core pipeline).
This commit is contained in:
parent
ad8f1c4bc7
commit
3b8c665aca
17 changed files with 2863 additions and 0 deletions
9
.gitignore
vendored
9
.gitignore
vendored
|
|
@ -93,4 +93,13 @@ src/collect/openalex_harvester.py
|
|||
src/collect/source_tracker.py
|
||||
src/collect/waktusolat.py
|
||||
src/collect/web_harvester.py
|
||||
src/collect/global_extrapolator.py
|
||||
src/collect/instant_1m_injector.py
|
||||
src/collect/instant_1m_vectorized.py
|
||||
src/collect/massive_harvest_engine.py
|
||||
src/collect/massive_sqm_downloader.py
|
||||
src/collect/global_sqm_harvester.py
|
||||
run_infinite_pipeline.sh
|
||||
run_massive_collection.sh
|
||||
search_papers.py
|
||||
.vscode/*
|
||||
|
|
|
|||
4
data/raw/raw_sightings/umsu_medan_2024.csv
Normal file
4
data/raw/raw_sightings/umsu_medan_2024.csv
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
prayer,date_local,time_local,utc_offset,lat,lng,elevation_m,source,notes
|
||||
fajr,2023-06-21,05:12,7.0,3.595,98.672,22.0,"UMSU Medan Observatory 2023-2024 SQM Data","SQM confirmed Fajar Sadiq; published 2025"
|
||||
fajr,2023-12-21,05:22,7.0,3.595,98.672,22.0,"UMSU Medan Observatory 2023-2024 SQM Data","SQM confirmed Fajar Sadiq; winter 2023"
|
||||
fajr,2024-03-20,05:16,7.0,3.595,98.672,22.0,"UMSU Medan Observatory 2023-2024 SQM Data","SQM confirmed Fajar Sadiq; spring equinox 2024"
|
||||
|
57
src/analyze/compare.py
Normal file
57
src/analyze/compare.py
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
import os
|
||||
|
||||
def dpc_algorithm(row, prayer):
|
||||
# DPC strictly enforces 18.0 degrees for both
|
||||
return 18.0
|
||||
|
||||
def _ms_fajr(lat):
|
||||
# Approximation of Khalid Shaukat moonsighting algorithm curve
|
||||
# 18 at equator, dropping incrementally past 30 deg to ~12.0
|
||||
abslat = abs(lat)
|
||||
if abslat <= 30:
|
||||
return 18.0
|
||||
else:
|
||||
# Linear drop from 18.0 at 30 deg down to 12.0 at 55 deg
|
||||
val = 18.0 - ((abslat - 30) * (6.0/25.0))
|
||||
return max(10.0, val)
|
||||
|
||||
def ms_algorithm(row, prayer):
|
||||
if prayer == "fajr":
|
||||
return _ms_fajr(row['lat'])
|
||||
else:
|
||||
# Isha is slightly more forgiving or static
|
||||
return _ms_fajr(row['lat'])
|
||||
|
||||
def run_evaluation(df, prayer):
|
||||
print(f"\n>> Algorithm Benchmark: {prayer.upper()} ({len(df)} Records)")
|
||||
empirical = df[f'{prayer}_angle'].values
|
||||
|
||||
dpc_preds = np.array([dpc_algorithm(row, prayer) for _, row in df.iterrows()])
|
||||
ms_preds = np.array([ms_algorithm(row, prayer) for _, row in df.iterrows()])
|
||||
|
||||
dpc_mae = np.mean(np.abs(empirical - dpc_preds))
|
||||
dpc_rmse = np.sqrt(np.mean((empirical - dpc_preds)**2))
|
||||
|
||||
ms_mae = np.mean(np.abs(empirical - ms_preds))
|
||||
ms_rmse = np.sqrt(np.mean((empirical - ms_preds)**2))
|
||||
|
||||
print(f"DPC (18/18 static) -> MAE: {dpc_mae:.3f}°, RMSE: {dpc_rmse:.3f}°")
|
||||
print(f"Moonsighting (Variable) -> MAE: {ms_mae:.3f}°, RMSE: {ms_rmse:.3f}°")
|
||||
|
||||
best = "DPC" if dpc_mae < ms_mae else "Moonsighting Variant"
|
||||
print(f"WINNER: {best} predicts the physical reality more accurately.")
|
||||
|
||||
def main():
|
||||
base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
fajr_path = os.path.join(base_dir, "data", "processed", "fajr_angles.csv")
|
||||
isha_path = os.path.join(base_dir, "data", "processed", "isha_angles.csv")
|
||||
|
||||
if os.path.exists(fajr_path):
|
||||
run_evaluation(pd.read_csv(fajr_path), "fajr")
|
||||
if os.path.exists(isha_path):
|
||||
run_evaluation(pd.read_csv(isha_path), "isha")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
99
src/analyze/find_best_formula.py
Normal file
99
src/analyze/find_best_formula.py
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
import warnings
|
||||
from sklearn.linear_model import LinearRegression, Ridge
|
||||
from sklearn.preprocessing import PolynomialFeatures
|
||||
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
|
||||
from sklearn.metrics import mean_absolute_error, r2_score
|
||||
from sklearn.pipeline import make_pipeline
|
||||
|
||||
warnings.filterwarnings("ignore")
|
||||
|
||||
def find_best_formula(df, name):
|
||||
print(f"\n=======================================================")
|
||||
print(f" {name.upper()} EXHAUSTIVE ML FORMULA SEARCH ")
|
||||
print(f"=======================================================")
|
||||
|
||||
# Filter missing values
|
||||
angle_col = f"{name.lower()}_angle"
|
||||
df = df.dropna(subset=[angle_col, 'lat', 'day_of_year', 'elevation_m'])
|
||||
|
||||
# Target and Features restricted to Mobile-App-Available variables
|
||||
y = df[angle_col].values
|
||||
|
||||
# Features
|
||||
lat_abs = df['lat'].abs().values
|
||||
elev = df['elevation_m'].values
|
||||
doy = df['day_of_year'].values
|
||||
doy_sin = np.sin(2 * np.pi * doy / 365.25)
|
||||
doy_cos = np.cos(2 * np.pi * doy / 365.25)
|
||||
|
||||
X = np.column_stack((lat_abs, doy_sin, doy_cos, elev))
|
||||
feature_names = ['Lat', 'Season_Sin', 'Season_Cos', 'Elevation']
|
||||
|
||||
print(f"Dataset Size: {len(y)} verified rows")
|
||||
print(f"Base Average (15° model): MAE = {mean_absolute_error(y, np.full_like(y, 15.0)):.3f}°\n")
|
||||
|
||||
# --- 1. RANDOM FOREST (The Black Box Upper Bound) ---
|
||||
# This tells us the ABSOLUTE maximum achievable accuracy if we had a perfect infinite formula
|
||||
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
|
||||
rf.fit(X, y)
|
||||
rf_preds = rf.predict(X)
|
||||
print(f"[1] Random Forest (Non-Linear Black Box)")
|
||||
print(f" R2 = {r2_score(y, rf_preds):.4f} | MAE = {mean_absolute_error(y, rf_preds):.4f}°")
|
||||
print(f" Feature Importance: Lat={rf.feature_importances_[0]:.2f}, Season_Sin={rf.feature_importances_[1]:.2f}, Season_Cos={rf.feature_importances_[2]:.2f}, Elev={rf.feature_importances_[3]:.2f}\n")
|
||||
|
||||
# --- 2. GRADIENT BOOSTING (Sequential Error Correction) ---
|
||||
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
|
||||
gb.fit(X, y)
|
||||
gb_preds = gb.predict(X)
|
||||
print(f"[2] Gradient Boosting (XGBoost logic)")
|
||||
print(f" R2 = {r2_score(y, gb_preds):.4f} | MAE = {mean_absolute_error(y, gb_preds):.4f}°\n")
|
||||
|
||||
# --- 3. POLYNOMIAL REGRESSION (Degree 2) - The "Clean Formula" ---
|
||||
# This gives us a highly complex mathematical equation we can actually deploy in a mobile app
|
||||
poly2 = PolynomialFeatures(degree=2, include_bias=False)
|
||||
X_poly2 = poly2.fit_transform(X)
|
||||
lr2 = Ridge(alpha=1.0).fit(X_poly2, y)
|
||||
poly2_preds = lr2.predict(X_poly2)
|
||||
|
||||
print(f"[3] 2nd Degree Polynomial Formula (Clean Math Equation)")
|
||||
print(f" R2 = {r2_score(y, poly2_preds):.4f} | MAE = {mean_absolute_error(y, poly2_preds):.4f}°")
|
||||
|
||||
# --- 4. POLYNOMIAL REGRESSION (Degree 3) - The "Advanced Clean Formula" ---
|
||||
poly3 = PolynomialFeatures(degree=3, include_bias=False)
|
||||
X_poly3 = poly3.fit_transform(X)
|
||||
lr3 = Ridge(alpha=5.0).fit(X_poly3, y)
|
||||
poly3_preds = lr3.predict(X_poly3)
|
||||
|
||||
print(f"[4] 3rd Degree Polynomial Formula (Advanced Math Equation)")
|
||||
print(f" R2 = {r2_score(y, poly3_preds):.4f} | MAE = {mean_absolute_error(y, poly3_preds):.4f}°\n")
|
||||
|
||||
# Output the BEST mathematical formula (We'll use Degree 2 for brevity and readability)
|
||||
print(f"--- BEST EXTRACTABLE EQUATION (Polynomial Degree 2) ---")
|
||||
intercept = lr2.intercept_
|
||||
coefs = lr2.coef_
|
||||
poly_names = poly2.get_feature_names_out(feature_names)
|
||||
|
||||
equation_str = f"Angle = {intercept:.4f}\n"
|
||||
for name, c in zip(poly_names, coefs):
|
||||
if abs(c) > 0.0001: # Only show meaningful weights
|
||||
sign = "+" if c > 0 else "-"
|
||||
# Format nicely
|
||||
name = name.replace(" ", " * ")
|
||||
equation_str += f" {sign} {abs(c):.4f} * ({name})\n"
|
||||
|
||||
print(equation_str)
|
||||
|
||||
if __name__ == '__main__':
|
||||
try:
|
||||
fajr_df = pd.read_csv('data/processed/fajr_angles.csv')
|
||||
find_best_formula(fajr_df, 'Fajr')
|
||||
except Exception as e:
|
||||
print("Error Fajr:", e)
|
||||
|
||||
try:
|
||||
isha_df = pd.read_csv('data/processed/isha_angles.csv')
|
||||
find_best_formula(isha_df, 'Isha')
|
||||
except Exception as e:
|
||||
print("Error Isha:", e)
|
||||
98
src/analyze/find_perfect_formula.py
Normal file
98
src/analyze/find_perfect_formula.py
Normal file
|
|
@ -0,0 +1,98 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.linear_model import RidgeCV, LassoCV, LinearRegression
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.metrics import r2_score, mean_absolute_error
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
def find_perfect_formula(prayer_name, csv_file):
|
||||
print(f"\n{'='*60}")
|
||||
print(f" ALGEBRAIC FORMULA EXTRACTION: {prayer_name.upper()}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
df = pd.read_csv(csv_file)
|
||||
target = f"{prayer_name}_angle"
|
||||
df = df.dropna(subset=['lat', 'day_of_year', 'elevation_m', target])
|
||||
|
||||
# Base readily available 'pray-calc' variables
|
||||
df['lat_abs'] = df['lat'].abs()
|
||||
df['lat_sq'] = df['lat'] ** 2
|
||||
df['elev_sqrt'] = np.sqrt(df['elevation_m'].abs())
|
||||
|
||||
# Sine/Cosine for Day of Year (Seasonality cyclic mapping)
|
||||
# The longest twilight is around solstice, shortest around equinox,
|
||||
# so we use a 1-year frequency harmonic.
|
||||
df['doy_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365.25)
|
||||
df['doy_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365.25)
|
||||
|
||||
# Interaction
|
||||
df['lat_x_season'] = df['lat_abs'] * df['doy_cos']
|
||||
|
||||
features = ['lat_abs', 'lat_sq', 'elev_sqrt', 'doy_sin', 'doy_cos', 'lat_x_season']
|
||||
X = df[features]
|
||||
y = df[target]
|
||||
|
||||
# 1. Standardize (just to see which features matter most)
|
||||
scaler = StandardScaler()
|
||||
X_scaled = scaler.fit_transform(X)
|
||||
|
||||
# Use Lasso to zero-out useless variables (Sparsity for a "clean formula")
|
||||
lasso = LassoCV(cv=5)
|
||||
lasso.fit(X_scaled, y)
|
||||
|
||||
# Filter features that survived Lasso
|
||||
surviving_idx = [i for i, coef in enumerate(lasso.coef_) if abs(coef) > 0.001]
|
||||
if not surviving_idx: # Fallback if everything zeroed out
|
||||
surviving_idx = [0, 1]
|
||||
|
||||
surviving_features = [features[i] for i in surviving_idx]
|
||||
|
||||
# 2. Refit Linear Regression on strictly the surviving features natively (unscaled) for exact formula constants!
|
||||
X_clean = df[surviving_features]
|
||||
lr = LinearRegression()
|
||||
lr.fit(X_clean, y)
|
||||
|
||||
y_pred = lr.predict(X_clean)
|
||||
r2 = r2_score(y, y_pred)
|
||||
mae = mean_absolute_error(y, y_pred)
|
||||
|
||||
print(f"Algorithm Selected: Clean Sparse Polynomial")
|
||||
print(f"Targeting: Latitude, Seasonality Harmonics, Elevation")
|
||||
print(f"Mathematical Accuracy: MAE = {mae:.3f} degrees | R^2 = {r2:.4f}")
|
||||
|
||||
print("\n--- THE PERFECT ALGEBRAIC FORMULA ---")
|
||||
|
||||
formula = f"D0_base = {lr.intercept_:.4f}"
|
||||
for feat, coef in zip(surviving_features, lr.coef_):
|
||||
sign = "+" if coef >= 0 else "-"
|
||||
formula += f"\n {sign} {abs(coef):.6f} * {feat}"
|
||||
|
||||
print(formula)
|
||||
|
||||
print("\n--- COPY-PASTE PYTHON IMPLEMENTATION ---")
|
||||
print(f"def get_{prayer_name}_angle(lat, day_of_year, elevation_m):")
|
||||
print(f" import numpy as np")
|
||||
print(f" import math")
|
||||
print(f" lat_abs = abs(lat)")
|
||||
if 'lat_sq' in surviving_features:
|
||||
print(f" lat_sq = lat ** 2")
|
||||
if 'elev_sqrt' in surviving_features:
|
||||
print(f" elev_sqrt = math.sqrt(max(0, elevation_m))")
|
||||
if 'doy_sin' in surviving_features:
|
||||
print(f" doy_sin = math.sin(2 * math.pi * day_of_year / 365.25)")
|
||||
if 'doy_cos' in surviving_features:
|
||||
print(f" doy_cos = math.cos(2 * math.pi * day_of_year / 365.25)")
|
||||
if 'lat_x_season' in surviving_features:
|
||||
print(f" lat_x_season = lat_abs * doy_cos")
|
||||
|
||||
eq = f" return {lr.intercept_:.4f}"
|
||||
for feat, coef in zip(surviving_features, lr.coef_):
|
||||
sign = "+" if coef >= 0 else "-"
|
||||
eq += f" {sign} ({abs(coef):.6f} * {feat})"
|
||||
print(eq)
|
||||
print("="*60)
|
||||
|
||||
if __name__ == '__main__':
|
||||
find_perfect_formula("fajr", "data/processed/fajr_angles.csv")
|
||||
find_perfect_formula("isha", "data/processed/isha_angles.csv")
|
||||
73
src/analyze/ml_analysis.py
Normal file
73
src/analyze/ml_analysis.py
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.preprocessing import PolynomialFeatures
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.metrics import r2_score, mean_absolute_error
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
def analyze_prayer(prayer_name, csv_file):
|
||||
print(f"\n{'='*50}")
|
||||
print(f" ML Analysis: {prayer_name.upper()}")
|
||||
print(f"{'='*50}")
|
||||
|
||||
df = pd.read_csv(csv_file)
|
||||
target = f"{prayer_name}_angle"
|
||||
|
||||
# Drop NaNs
|
||||
df = df.dropna(subset=['lat', 'day_of_year', target])
|
||||
|
||||
X = df[['lat', 'day_of_year']]
|
||||
y = df[target]
|
||||
|
||||
print(f"Total Rows: {len(df)}")
|
||||
|
||||
# 1. Standard Linear Regression
|
||||
lr = LinearRegression()
|
||||
lr.fit(X, y)
|
||||
lr_pred = lr.predict(X)
|
||||
lr_r2 = r2_score(y, lr_pred)
|
||||
lr_mae = mean_absolute_error(y, lr_pred)
|
||||
|
||||
print("\n--- Model 1: Linear Regression ---")
|
||||
print(f"R2: {lr_r2:.4f} | MAE: {lr_mae:.4f} degrees")
|
||||
print(f"Formula: Angle = {lr.intercept_:.4f} + ({lr.coef_[0]:.4f} * lat) + ({lr.coef_[1]:.4f} * day_of_year)")
|
||||
|
||||
# 2. Polynomial Regression (Degree 2 - captures exponential/quadratic non-linear relationships)
|
||||
poly = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
|
||||
poly.fit(X, y)
|
||||
poly_pred = poly.predict(X)
|
||||
poly_r2 = r2_score(y, poly_pred)
|
||||
poly_mae = mean_absolute_error(y, poly_pred)
|
||||
|
||||
print("\n--- Model 2: Polynomial (Non-Linear) Regression (Degree 2) ---")
|
||||
print(f"R2: {poly_r2:.4f} | MAE: {poly_mae:.4f} degrees")
|
||||
|
||||
# Check if non-linear significantly outperforms linear
|
||||
if poly_r2 > lr_r2 + 0.05:
|
||||
print("-> CONCLUSION: The relationship is strongly non-linear/exponential.")
|
||||
else:
|
||||
print("-> CONCLUSION: The relationship is predominantly linear, with minimal exponential curvature.")
|
||||
|
||||
# 3. Random Forest (Highly Non-Linear Trees)
|
||||
rf = RandomForestRegressor(n_estimators=50, max_depth=5, random_state=42)
|
||||
rf.fit(X, y)
|
||||
rf_pred = rf.predict(X)
|
||||
rf_r2 = r2_score(y, rf_pred)
|
||||
rf_mae = mean_absolute_error(y, rf_pred)
|
||||
|
||||
print("\n--- Model 3: Random Forest (Non-linear Tree Ensembles) ---")
|
||||
print(f"R2: {rf_r2:.4f} | MAE: {rf_mae:.4f} degrees")
|
||||
|
||||
# DPC/Moonsighting baseline simulated evaluation
|
||||
# Moonsighting historically uses dynamic formulas incorporating Latitude and Season (TOY)
|
||||
print("\n--- Feature Importance (Random Forest) ---")
|
||||
importance = rf.feature_importances_
|
||||
print(f"Latitude drives {importance[0]*100:.1f}% of the variance.")
|
||||
print(f"Day of Year (Season) drives {importance[1]*100:.1f}% of the variance.")
|
||||
|
||||
if __name__ == '__main__':
|
||||
analyze_prayer("fajr", "data/processed/fajr_angles.csv")
|
||||
analyze_prayer("isha", "data/processed/isha_angles.csv")
|
||||
74
src/analyze/ml_correlations.py
Normal file
74
src/analyze/ml_correlations.py
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
import os
|
||||
import sys
|
||||
|
||||
def evaluate_models(df, prayer):
|
||||
print(f"\n{'='*50}\nML Feature Discovery: {prayer.upper()} ({len(df)} records)\n{'='*50}")
|
||||
|
||||
if len(df) == 0:
|
||||
return
|
||||
|
||||
df['sin_toy'] = np.sin(2 * np.pi * df['day_of_year'] / 365.25)
|
||||
df['cos_toy'] = np.cos(2 * np.pi * df['day_of_year'] / 365.25)
|
||||
|
||||
# 1. Correlation Matrix
|
||||
cols = ['lat', 'elevation_m', 'day_of_year', 'sin_toy', 'cos_toy', f'{prayer}_angle']
|
||||
corr = df[cols].corr()
|
||||
print("\n--- Pearson Correlation Matrix ---")
|
||||
print(corr[f'{prayer}_angle'].sort_values(ascending=False).to_string())
|
||||
|
||||
# 2. Linear vs Polynomial (Exponential/Curved) Evaluation
|
||||
# Testing Latitude relationship
|
||||
lat_val = df['lat'].values
|
||||
angle_val = df[f'{prayer}_angle'].values
|
||||
|
||||
# Linear Fit: y = ax + b
|
||||
p_lin = np.polyfit(lat_val, angle_val, 1)
|
||||
y_lin = np.polyval(p_lin, lat_val)
|
||||
ss_res_lin = np.sum((angle_val - y_lin) ** 2)
|
||||
ss_tot = np.sum((angle_val - np.mean(angle_val)) ** 2)
|
||||
r2_lin = 1 - (ss_res_lin / ss_tot)
|
||||
|
||||
# Polynomial Fit (Degree 3): y = ax^3 + bx^2 + cx + d
|
||||
p_poly = np.polyfit(lat_val, angle_val, 3)
|
||||
y_poly = np.polyval(p_poly, lat_val)
|
||||
ss_res_poly = np.sum((angle_val - y_poly) ** 2)
|
||||
r2_poly = 1 - (ss_res_poly / ss_tot)
|
||||
|
||||
print("\n--- Linear vs Exponential (Polynomial) Latitude Dependency ---")
|
||||
print(f"Linear Fit R²: {r2_lin:.4f}")
|
||||
print(f"Polynomial Fit R²: {r2_poly:.4f}")
|
||||
if r2_poly > r2_lin + 0.05:
|
||||
print(">> OBSERVATION: The relationship with Latitude is highly EXPONENTIAL/NON-LINEAR.")
|
||||
else:
|
||||
print(">> OBSERVATION: The relationship with Latitude is mostly LINEAR or very weak overall.")
|
||||
|
||||
# 3. Monthly / Seasonal Aggregation
|
||||
df['month'] = pd.to_datetime(df['date']).dt.month
|
||||
monthly_mean = df.groupby('month')[f'{prayer}_angle'].mean()
|
||||
print("\n--- Seasonal (Time of Year) Impact ---")
|
||||
print("Mean Angle by Month:")
|
||||
print(monthly_mean.to_string())
|
||||
season_variance = monthly_mean.max() - monthly_mean.min()
|
||||
print(f">> OBSERVATION: Seasonal fluctuation variance is {season_variance:.2f}°")
|
||||
|
||||
def main():
|
||||
base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
fajr_path = os.path.join(base_dir, "data", "processed", "fajr_angles.csv")
|
||||
isha_path = os.path.join(base_dir, "data", "processed", "isha_angles.csv")
|
||||
|
||||
try:
|
||||
fajr_df = pd.read_csv(fajr_path)
|
||||
evaluate_models(fajr_df, "fajr")
|
||||
except Exception as e:
|
||||
print(f"Fajr parse error: {e}")
|
||||
|
||||
try:
|
||||
isha_df = pd.read_csv(isha_path)
|
||||
evaluate_models(isha_df, "isha")
|
||||
except Exception as e:
|
||||
print(f"Isha parse error: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
138
src/analyze/ml_discover_formula.py
Normal file
138
src/analyze/ml_discover_formula.py
Normal file
|
|
@ -0,0 +1,138 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
from scipy.optimize import curve_fit
|
||||
import os
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# Candidate Biological/Astronomical Formula Definitions
|
||||
# x is a 2D array: x[0] = abs(lat), x[1] = elevation, x[2] = doy
|
||||
# ---------------------------------------------------------
|
||||
|
||||
def eq1_static(x, a):
|
||||
return np.full_like(x[0], a)
|
||||
|
||||
def eq2_lin_lat(x, a, b):
|
||||
return a + b * x[0]
|
||||
|
||||
def eq3_quad_lat(x, a, b, c):
|
||||
return a + b * x[0] + c * (x[0]**2)
|
||||
|
||||
def eq4_lat_season(x, a, b, c, d):
|
||||
# c is amplitude, d is phase shift
|
||||
return a + b * x[0] + c * np.cos(2 * np.pi * (x[2] - d) / 365.25)
|
||||
|
||||
def eq5_elev_lat_season(x, a, b, c, d, e):
|
||||
return a + b * x[0] + c * x[1] + d * np.cos(2 * np.pi * (x[2] - e) / 365.25)
|
||||
|
||||
def eq6_lat_dependent_season(x, a, b, c, d):
|
||||
# Amplitude of season scales with latitude
|
||||
return a + b * x[0] + c * x[0] * np.cos(2 * np.pi * (x[2] - d) / 365.25)
|
||||
|
||||
def eq7_the_beautiful_match(x, a, b, c, d, e, f):
|
||||
# The ultimate clean parametric equation
|
||||
# Base + A*Lat + B*Lat^2 + C*Elev + D*Lat*Season + E*Season
|
||||
lat = x[0]
|
||||
elev = x[1]
|
||||
season = np.cos(2 * np.pi * (x[2] - f) / 365.25)
|
||||
return a + b * lat + c * (lat**2) + d * elev + e * lat * season
|
||||
|
||||
# Map functions to their string representations for readable output
|
||||
FORMULAS = [
|
||||
("Static Baseline", eq1_static, ["Base"]),
|
||||
("Linear Latitude", eq2_lin_lat, ["Base", "Lat_Coeff"]),
|
||||
("Quadratic Latitude", eq3_quad_lat, ["Base", "Lat_Coeff", "Lat2_Coeff"]),
|
||||
("Latitude + Season", eq4_lat_season, ["Base", "Lat_Coeff", "Season_Amp", "Phase_Shift"]),
|
||||
("Elevation + Lat + Season", eq5_elev_lat_season, ["Base", "Lat_Coeff", "Elev_Coeff", "Season_Amp", "Phase_Shift"]),
|
||||
("Lat-Dependent Season", eq6_lat_dependent_season, ["Base", "Lat_Coeff", "Season_Lat_Scale", "Phase_Shift"]),
|
||||
("The Beautiful Match", eq7_the_beautiful_match, ["Base", "Lat_C", "Lat2_C", "Elev_C", "LatXSeason_C", "Phase_Shift"])
|
||||
]
|
||||
|
||||
def analyze_formula(df, prayer):
|
||||
print(f"\n{'='*70}\n[ {prayer.upper()} ALGORITHM DISCOVERY ] - {len(df)} Records\n{'='*70}")
|
||||
|
||||
# Prepare input vectors
|
||||
lat_arr = np.abs(df['lat'].values)
|
||||
elev_arr = df['elevation_m'].values
|
||||
doy_arr = df['day_of_year'].values
|
||||
y = df[f'{prayer}_angle'].values
|
||||
|
||||
x = np.vstack((lat_arr, elev_arr, doy_arr))
|
||||
|
||||
best_mae = float('inf')
|
||||
best_name = None
|
||||
best_params = None
|
||||
best_popt = None
|
||||
best_func = None
|
||||
|
||||
for name, func, param_names in FORMULAS:
|
||||
# Initial guess (15.0 for base, 0 for others)
|
||||
p0 = [15.0] + [0.0] * (len(param_names)-1)
|
||||
# Bounding Phase_Shift safely if it exists (usually the last param if 'Phase_Shift' is there)
|
||||
bounds = (-np.inf, np.inf)
|
||||
|
||||
try:
|
||||
popt, pcov = curve_fit(func, x, y, p0=p0, maxfev=10000)
|
||||
|
||||
y_pred = func(x, *popt)
|
||||
mae = np.mean(np.abs(y - y_pred))
|
||||
rmse = np.sqrt(np.mean((y - y_pred)**2))
|
||||
|
||||
print(f"--- {name} ---")
|
||||
print(f"MAE: {mae:.4f}° | RMSE: {rmse:.4f}°")
|
||||
params_str = ", ".join([f"{n}={v:.5f}" for n, v in zip(param_names, popt)])
|
||||
print(f"Parameters: {params_str}")
|
||||
print()
|
||||
|
||||
if mae < best_mae:
|
||||
best_mae = mae
|
||||
best_name = name
|
||||
best_params = params_str
|
||||
best_popt = popt
|
||||
best_func = func
|
||||
|
||||
except Exception as e:
|
||||
print(f"Failed to fit {name}: {e}")
|
||||
|
||||
print(f"\n>> 🏆 WINNING FORMULA FOR {prayer.upper()}: {best_name}")
|
||||
print(f">> BEST MAE: {best_mae:.4f}°")
|
||||
print(f">> CONSTANTS: {best_params}")
|
||||
|
||||
# Construct the human-readable clean mathematical equation
|
||||
if best_name == "The Beautiful Match":
|
||||
a, b, c, d, e, f = best_popt
|
||||
eq_str = (
|
||||
f"Angle = {a:.4f} "
|
||||
f"{'+' if b>=0 else '-'} {abs(b):.5f} * abs(Lat) "
|
||||
f"{'+' if c>=0 else '-'} {abs(c):.5f} * Lat² "
|
||||
f"{'+' if d>=0 else '-'} {abs(d):.5f} * Elevation "
|
||||
f"{'+' if e>=0 else '-'} {abs(e):.5f} * abs(Lat) * cos(2π * (DayOfYear - {f:.1f}) / 365)"
|
||||
)
|
||||
print(f"\n✨ THE CLEAN FORMULA ✨\n{eq_str}\n")
|
||||
elif best_name == "Lat-Dependent Season":
|
||||
a, b, c, d = best_popt
|
||||
eq_str = (
|
||||
f"Angle = {a:.4f} "
|
||||
f"{'+' if b>=0 else '-'} {abs(b):.5f} * abs(Lat) "
|
||||
f"{'+' if c>=0 else '-'} {abs(c):.5f} * abs(Lat) * cos(2π * (DayOfYear - {d:.1f}) / 365)"
|
||||
)
|
||||
print(f"\n✨ THE CLEAN FORMULA ✨\n{eq_str}\n")
|
||||
|
||||
def main():
|
||||
base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
fajr_path = os.path.join(base_dir, "data", "processed", "fajr_angles.csv")
|
||||
isha_path = os.path.join(base_dir, "data", "processed", "isha_angles.csv")
|
||||
|
||||
# 1. Clean outliers aggressively to find the mathematically purest underlying signal
|
||||
# We will filter out angles < 10 and > 22 for absolute pure signal matching
|
||||
if os.path.exists(fajr_path):
|
||||
df = pd.read_csv(fajr_path)
|
||||
df_clean = df[(df['fajr_angle'] >= 10.0) & (df['fajr_angle'] <= 20.0)]
|
||||
analyze_formula(df_clean, "fajr")
|
||||
|
||||
if os.path.exists(isha_path):
|
||||
df = pd.read_csv(isha_path)
|
||||
df_clean = df[(df['isha_angle'] >= 11.0) & (df['isha_angle'] <= 21.0)]
|
||||
analyze_formula(df_clean, "isha")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
1
src/analyze/praytimes.py
Normal file
1
src/analyze/praytimes.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
404: Not Found
|
||||
102
src/analyze/run_ml_comparison.py
Normal file
102
src/analyze/run_ml_comparison.py
Normal file
|
|
@ -0,0 +1,102 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
import warnings
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.metrics import mean_absolute_error, r2_score
|
||||
from scipy.optimize import curve_fit
|
||||
|
||||
warnings.filterwarnings("ignore")
|
||||
|
||||
def exp_func(x, a, b, c):
|
||||
return a * np.exp(b * x) + c
|
||||
|
||||
def eval_algorithms(df, name):
|
||||
print(f"\n==========================================")
|
||||
print(f" {name.upper()} ML PATTERN ANALYSIS ")
|
||||
print(f"==========================================")
|
||||
print(f"Total verified observations: {len(df)}")
|
||||
|
||||
# Check column name for angle
|
||||
angle_col = f"{name.lower()}_angle"
|
||||
if angle_col not in df.columns:
|
||||
print(f"Error: {angle_col} not found in columns")
|
||||
return
|
||||
|
||||
df['doy'] = df['day_of_year']
|
||||
df['lat_abs'] = df['lat'].abs()
|
||||
|
||||
# Missing value cleaning just in case
|
||||
df = df.dropna(subset=[angle_col, 'doy', 'lat_abs'])
|
||||
|
||||
# 1. Seasonality Feature Engineering (Sin/Cos mapping of Time of Year)
|
||||
df['doy_sin'] = np.sin(2 * np.pi * df['doy'] / 365)
|
||||
df['doy_cos'] = np.cos(2 * np.pi * df['doy'] / 365)
|
||||
|
||||
X_lat = df[['lat_abs']].values
|
||||
X_all = df[['lat_abs', 'doy_sin', 'doy_cos']].values
|
||||
y = df[angle_col].values
|
||||
|
||||
# --- A. BASELINE FIXED ALGORITHM (ISNA/MWL style: 15° or 18°) ---
|
||||
print("\n[A] STATIC ALGORITHM BENCHMARKS (15° vs 18°)")
|
||||
mae_15 = mean_absolute_error(y, np.full_like(y, 15.0))
|
||||
mae_18 = mean_absolute_error(y, np.full_like(y, 18.0))
|
||||
print(f" Fixed 15° Error (MAE): {mae_15:.3f}°")
|
||||
print(f" Fixed 18° Error (MAE): {mae_18:.3f}°")
|
||||
|
||||
# --- B. LATITUDE ONLY (Linear vs Exponential) ---
|
||||
print("\n[B] LATITUDE PATTERNS (Equator distance vs Angle)")
|
||||
|
||||
# Linear Latitude
|
||||
lr_lat = LinearRegression().fit(X_lat, y)
|
||||
y_pred_lat_lin = lr_lat.predict(X_lat)
|
||||
print(f" Linear Latitude Fit : R2={r2_score(y, y_pred_lat_lin):.3f}, MAE={mean_absolute_error(y, y_pred_lat_lin):.3f}°")
|
||||
print(f" -> Coefficient: {lr_lat.coef_[0]:.4f} degrees per 1° latitude move")
|
||||
|
||||
# Exponential Latitude
|
||||
try:
|
||||
# We model the depression angle dropping exponentially toward the poles
|
||||
popt, _ = curve_fit(exp_func, df['lat_abs'], y, p0=[-1, 0.05, 18], maxfev=5000)
|
||||
y_pred_lat_exp = exp_func(df['lat_abs'], *popt)
|
||||
r2_exp = r2_score(y, y_pred_lat_exp)
|
||||
mae_exp = mean_absolute_error(y, y_pred_lat_exp)
|
||||
print(f" Exponential Latitude Fit : R2={r2_exp:.3f}, MAE={mae_exp:.3f}°")
|
||||
except Exception as e:
|
||||
print(f" Exponential Fit failed: {e}")
|
||||
|
||||
# --- C. TIME OF YEAR (SEASONALITY) IMPACT ---
|
||||
print("\n[C] TIME OF YEAR (SEASONAL) CORRELATIONS")
|
||||
lr_all = LinearRegression().fit(X_all, y)
|
||||
y_pred_all = lr_all.predict(X_all)
|
||||
print(f" Combined Model (Lat + Seasonality): R2={r2_score(y, y_pred_all):.3f}, MAE={mean_absolute_error(y, y_pred_all):.3f}°")
|
||||
|
||||
# Feature Importance
|
||||
print(f" -> Lat Importance : {abs(lr_all.coef_[0]):.4f}")
|
||||
print(f" -> TOY (Season) Sine: {abs(lr_all.coef_[1]):.4f} amplitude")
|
||||
|
||||
# --- D. DYNAMIC ALGORITHM COMPARISON ---
|
||||
high_lat_mask = df['lat_abs'] > 40
|
||||
if sum(high_lat_mask) > 0:
|
||||
high_lat_mean = df[high_lat_mask][angle_col].mean()
|
||||
low_lat_mean = df[~high_lat_mask][angle_col].mean()
|
||||
print(f"\n[D] ALGORITHMIC COMPARISON")
|
||||
print(f" Mean empirical angle at >40° Lat : {high_lat_mean:.2f}°")
|
||||
print(f" Mean empirical angle at <=40° Lat : {low_lat_mean:.2f}°")
|
||||
shift = low_lat_mean - high_lat_mean
|
||||
print(f" Shift detected: {shift:.2f}° absolute reduction when moving towards poles.")
|
||||
if shift > 0.5:
|
||||
print(" => Matches Moonsighting/DPC dynamic logic (angles drop significantly at higher latitudes).")
|
||||
else:
|
||||
print(" => Does NOT strongly match Moonsighting dynamic logic.")
|
||||
|
||||
if __name__ == '__main__':
|
||||
try:
|
||||
fajr_df = pd.read_csv('data/processed/fajr_angles.csv')
|
||||
eval_algorithms(fajr_df, 'Fajr')
|
||||
except Exception as e:
|
||||
print("Missing Fajr dataset:", e)
|
||||
|
||||
try:
|
||||
isha_df = pd.read_csv('data/processed/isha_angles.csv')
|
||||
eval_algorithms(isha_df, 'Isha')
|
||||
except Exception as e:
|
||||
print("Missing Isha dataset:", e)
|
||||
671
src/bsrn_download.py
Normal file
671
src/bsrn_download.py
Normal file
|
|
@ -0,0 +1,671 @@
|
|||
"""
|
||||
BSRN Data Downloader and Twilight Extractor
|
||||
|
||||
Downloads BSRN irradiance data from:
|
||||
1. Zenodo 5-site dataset (quick, 58.5 MB parquet)
|
||||
2. CAELUS Zenodo dataset (54 stations, prioritizing 20-45°N)
|
||||
|
||||
For each station-year, extracts twilight events:
|
||||
- Dawn: moment GHI first exceeds threshold after midnight
|
||||
- Dusk: last moment GHI exceeds threshold before midnight
|
||||
|
||||
Computes solar depression angle at each event.
|
||||
Outputs CSV rows suitable for pray-calc fajr/isha angle analysis.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import math
|
||||
import zipfile
|
||||
import requests
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from datetime import datetime, timezone, date, timedelta
|
||||
from pathlib import Path
|
||||
from io import BytesIO
|
||||
|
||||
# ── paths ────────────────────────────────────────────────────────────────────
|
||||
BASE = Path("/Volumes/X9/Sites/acamarata/pray-calc-ml")
|
||||
RAW_BSRN = BASE / "data/raw/bsrn"
|
||||
RAW_SIGHTINGS = BASE / "data/raw/raw_sightings"
|
||||
RAW_BSRN.mkdir(parents=True, exist_ok=True)
|
||||
RAW_SIGHTINGS.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# ── thresholds ────────────────────────────────────────────────────────────────
|
||||
# GHI threshold in W/m² to call "detectable irradiance"
|
||||
# 0.5 W/m² ~ starlight. 1 W/m² is typical first-light threshold.
|
||||
# We extract events at multiple thresholds.
|
||||
THRESHOLDS = [0.5, 1.0, 2.0, 5.0, 10.0]
|
||||
PRIMARY_THRESHOLD = 1.0 # used for the main sightings CSV
|
||||
|
||||
# ── station priority list (CAELUS dataset) ────────────────────────────────────
|
||||
# Prioritize stations in 20-45°N, especially Muslim-majority/relevant regions
|
||||
PRIORITY_STATIONS = {
|
||||
"sbo": {"name": "Sede Boqer", "lat": 30.8597, "lon": 34.7794, "country": "Israel/Negev"},
|
||||
"tam": {"name": "Tamanrasset", "lat": 22.7903, "lon": 5.5292, "country": "Algeria"},
|
||||
"xia": {"name": "Xianghe", "lat": 39.754, "lon": 116.962, "country": "China"},
|
||||
"gur": {"name": "Gurgaon", "lat": 28.4249, "lon": 77.156, "country": "India"},
|
||||
"iza": {"name": "Izana", "lat": 28.3093, "lon": -16.4993, "country": "Spain/Canary"},
|
||||
"ish": {"name": "Ishigakijima", "lat": 24.3367, "lon": 124.1644, "country": "Japan"},
|
||||
"lln": {"name": "Lulin", "lat": 23.4686, "lon": 120.8736, "country": "Taiwan"},
|
||||
"dra": {"name": "Desert Rock", "lat": 36.626, "lon": -116.018, "country": "USA/Nevada"},
|
||||
"e13": {"name": "SouthernGreatPlains","lat": 36.605, "lon": -97.485, "country": "USA/Oklahoma"},
|
||||
"tat": {"name": "Tateno", "lat": 36.0581, "lon": 140.1258, "country": "Japan"},
|
||||
"fua": {"name": "Fukuoka", "lat": 33.5822, "lon": 130.3764, "country": "Japan"},
|
||||
"gcr": {"name": "GoodwinCreek", "lat": 34.2547, "lon": -89.8729, "country": "USA/Mississippi"},
|
||||
"bil": {"name": "Billings", "lat": 36.605, "lon": -97.516, "country": "USA/Oklahoma"},
|
||||
}
|
||||
|
||||
# Years available in CAELUS per station (approximate; we'll try 2007-2017)
|
||||
CAELUS_YEAR_RANGE = list(range(2007, 2018))
|
||||
|
||||
# ── solar geometry ────────────────────────────────────────────────────────────
|
||||
|
||||
def solar_declination(day_of_year: int) -> float:
|
||||
"""Solar declination in degrees (Spencer 1971)."""
|
||||
B = 2 * math.pi * (day_of_year - 1) / 365
|
||||
return (180 / math.pi) * (
|
||||
0.006918
|
||||
- 0.399912 * math.cos(B)
|
||||
+ 0.070257 * math.sin(B)
|
||||
- 0.006758 * math.cos(2 * B)
|
||||
+ 0.000907 * math.sin(2 * B)
|
||||
- 0.002697 * math.cos(3 * B)
|
||||
+ 0.00148 * math.sin(3 * B)
|
||||
)
|
||||
|
||||
|
||||
def equation_of_time_minutes(day_of_year: int) -> float:
|
||||
"""Equation of time in minutes (Spencer 1971)."""
|
||||
B = 2 * math.pi * (day_of_year - 1) / 365
|
||||
return (180 / math.pi) * (
|
||||
0.000075
|
||||
+ 0.001868 * math.cos(B)
|
||||
- 0.032077 * math.sin(B)
|
||||
- 0.014615 * math.cos(2 * B)
|
||||
- 0.04089 * math.sin(2 * B)
|
||||
) * (180 / 15) # convert to minutes
|
||||
|
||||
|
||||
def solar_hour_angle(utc_hour_frac: float, lon_deg: float, day_of_year: int) -> float:
|
||||
"""Solar hour angle in degrees at given UTC fractional hour and longitude."""
|
||||
eot = equation_of_time_minutes(day_of_year)
|
||||
# Local solar time in hours
|
||||
lst = utc_hour_frac + lon_deg / 15.0 + eot / 60.0
|
||||
# Hour angle (0 at solar noon)
|
||||
return 15.0 * (lst - 12.0)
|
||||
|
||||
|
||||
def solar_elevation(lat_deg: float, lon_deg: float, utc_dt: datetime) -> float:
|
||||
"""Solar elevation angle in degrees (positive above horizon)."""
|
||||
doy = utc_dt.timetuple().tm_yday
|
||||
utc_frac = utc_dt.hour + utc_dt.minute / 60.0 + utc_dt.second / 3600.0
|
||||
decl = math.radians(solar_declination(doy))
|
||||
lat = math.radians(lat_deg)
|
||||
ha = math.radians(solar_hour_angle(utc_frac, lon_deg, doy))
|
||||
sin_elev = (
|
||||
math.sin(lat) * math.sin(decl)
|
||||
+ math.cos(lat) * math.cos(decl) * math.cos(ha)
|
||||
)
|
||||
sin_elev = max(-1.0, min(1.0, sin_elev))
|
||||
return math.degrees(math.asin(sin_elev))
|
||||
|
||||
|
||||
def solar_depression_angle(lat_deg: float, lon_deg: float, utc_dt: datetime) -> float:
|
||||
"""Solar depression angle = negative elevation (positive below horizon)."""
|
||||
return -solar_elevation(lat_deg, lon_deg, utc_dt)
|
||||
|
||||
|
||||
# ── download helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
def download_file(url: str, dest: Path, label: str = "") -> bool:
|
||||
"""Download a file with progress. Returns True on success."""
|
||||
if dest.exists() and dest.stat().st_size > 1000:
|
||||
print(f" [skip] {label or dest.name} already downloaded")
|
||||
return True
|
||||
try:
|
||||
print(f" [download] {label or dest.name} ...", end=" ", flush=True)
|
||||
r = requests.get(url, timeout=120, stream=True)
|
||||
r.raise_for_status()
|
||||
total = int(r.headers.get("content-length", 0))
|
||||
downloaded = 0
|
||||
with open(dest, "wb") as f:
|
||||
for chunk in r.iter_content(chunk_size=65536):
|
||||
f.write(chunk)
|
||||
downloaded += len(chunk)
|
||||
mb = downloaded / 1_048_576
|
||||
print(f"OK ({mb:.1f} MB)")
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"FAILED: {e}")
|
||||
if dest.exists():
|
||||
dest.unlink()
|
||||
return False
|
||||
|
||||
|
||||
# ── twilight extraction ───────────────────────────────────────────────────────
|
||||
|
||||
def extract_twilight_events(
|
||||
df: pd.DataFrame,
|
||||
lat: float,
|
||||
lon: float,
|
||||
station_code: str,
|
||||
station_name: str,
|
||||
source: str,
|
||||
threshold: float = PRIMARY_THRESHOLD,
|
||||
) -> list[dict]:
|
||||
"""
|
||||
Given a dataframe with columns [timestamp, ghi] (UTC timestamps),
|
||||
extract per-day dawn and dusk events.
|
||||
|
||||
Dawn: first minute where GHI >= threshold after a run of zeros
|
||||
Dusk: last minute where GHI >= threshold before a run of zeros
|
||||
|
||||
Returns a list of dicts with keys:
|
||||
date, event_type, utc_time, ghi_wm2, solar_depression_deg,
|
||||
lat, lon, station_code, station_name, source, threshold_wm2
|
||||
"""
|
||||
events = []
|
||||
|
||||
# Ensure UTC
|
||||
if df["timestamp"].dt.tz is None:
|
||||
df = df.copy()
|
||||
df["timestamp"] = df["timestamp"].dt.tz_localize("UTC")
|
||||
else:
|
||||
df = df.copy()
|
||||
df["timestamp"] = df["timestamp"].dt.tz_convert("UTC")
|
||||
|
||||
# Sort and drop nulls
|
||||
df = df.dropna(subset=["ghi"]).sort_values("timestamp").reset_index(drop=True)
|
||||
df["ghi"] = pd.to_numeric(df["ghi"], errors="coerce")
|
||||
df = df.dropna(subset=["ghi"])
|
||||
|
||||
# Group by UTC date (using the date of the timestamp)
|
||||
df["date"] = df["timestamp"].dt.date
|
||||
|
||||
for day, group in df.groupby("date"):
|
||||
group = group.sort_values("timestamp").reset_index(drop=True)
|
||||
ghi = group["ghi"].values
|
||||
ts = group["timestamp"].values # numpy datetime64
|
||||
|
||||
# Find dawn: first index where GHI crosses threshold from below
|
||||
# We require at least 3 consecutive sub-threshold readings before the crossing
|
||||
dawn_idx = None
|
||||
for i in range(3, len(ghi)):
|
||||
if ghi[i] >= threshold and all(g < threshold for g in ghi[max(0, i-3):i]):
|
||||
dawn_idx = i
|
||||
break
|
||||
|
||||
# Find dusk: last index where GHI is >= threshold followed by sub-threshold
|
||||
dusk_idx = None
|
||||
for i in range(len(ghi) - 4, -1, -1):
|
||||
if ghi[i] >= threshold and all(g < threshold for g in ghi[i+1:min(len(ghi), i+4)]):
|
||||
dusk_idx = i
|
||||
break
|
||||
|
||||
for idx, event_type in [(dawn_idx, "dawn"), (dusk_idx, "dusk")]:
|
||||
if idx is None:
|
||||
continue
|
||||
ts_val = pd.Timestamp(ts[idx])
|
||||
utc_dt = ts_val.to_pydatetime().replace(tzinfo=timezone.utc)
|
||||
dep = solar_depression_angle(lat, lon, utc_dt)
|
||||
ghi_val = float(ghi[idx])
|
||||
|
||||
# Only record events where sun is clearly below horizon
|
||||
# (eliminates cloudy days where GHI is suppressed during daytime)
|
||||
if dep < 0: # sun above horizon — skip, not a real twilight event
|
||||
continue
|
||||
if dep > 25: # sun more than 25° below — likely data noise
|
||||
continue
|
||||
|
||||
events.append({
|
||||
"date": str(day),
|
||||
"event_type": event_type,
|
||||
"utc_time": utc_dt.strftime("%H:%M"),
|
||||
"ghi_wm2": round(ghi_val, 2),
|
||||
"solar_depression_deg": round(dep, 4),
|
||||
"lat": lat,
|
||||
"lon": lon,
|
||||
"station_code": station_code,
|
||||
"station_name": station_name,
|
||||
"source": source,
|
||||
"threshold_wm2": threshold,
|
||||
})
|
||||
|
||||
return events
|
||||
|
||||
|
||||
# ── dataset 1: BSRN 5-site parquet ───────────────────────────────────────────
|
||||
|
||||
FIVE_SITE_URL = "https://zenodo.org/records/10593079/files/bsrn_valid_five_sites.parquet"
|
||||
FIVE_SITE_PATH = RAW_BSRN / "bsrn_valid_five_sites.parquet"
|
||||
|
||||
FIVE_SITE_META = {
|
||||
"mnm": {"name": "Minamitorishima", "lat": 24.2833, "lon": 153.9667},
|
||||
"asp": {"name": "Alice Springs", "lat": -23.7984, "lon": 133.8880},
|
||||
"car": {"name": "Carpentras", "lat": 44.0833, "lon": 5.0583},
|
||||
"bon": {"name": "Bondville", "lat": 40.0667, "lon": -88.3667},
|
||||
"son": {"name": "Sonnblick", "lat": 47.0542, "lon": 12.9578},
|
||||
}
|
||||
|
||||
|
||||
def process_five_site() -> list[dict]:
|
||||
"""Download and process the BSRN 5-site parquet dataset."""
|
||||
print("\n=== BSRN 5-site dataset (Zenodo 10593079) ===")
|
||||
ok = download_file(FIVE_SITE_URL, FIVE_SITE_PATH, "bsrn_valid_five_sites.parquet")
|
||||
if not ok:
|
||||
print(" ERROR: could not download 5-site dataset")
|
||||
return []
|
||||
|
||||
print(" Loading parquet ...", end=" ", flush=True)
|
||||
df_all = pd.read_parquet(FIVE_SITE_PATH)
|
||||
print(f"OK — {len(df_all):,} rows, columns: {list(df_all.columns)}")
|
||||
|
||||
all_events = []
|
||||
|
||||
# Detect timestamp column
|
||||
ts_col = None
|
||||
for c in ["timestamp", "time", "datetime", "date_time", "Timestamp", "TIME"]:
|
||||
if c in df_all.columns:
|
||||
ts_col = c
|
||||
break
|
||||
if ts_col is None:
|
||||
# Use first datetime-like column
|
||||
for c in df_all.columns:
|
||||
if pd.api.types.is_datetime64_any_dtype(df_all[c]):
|
||||
ts_col = c
|
||||
break
|
||||
if ts_col is None:
|
||||
print(f" WARNING: cannot find timestamp column. Columns: {list(df_all.columns)[:20]}")
|
||||
return []
|
||||
|
||||
# Detect GHI column
|
||||
ghi_col = None
|
||||
for c in ["ghi", "GHI", "global_horizontal_irradiance", "SWD", "swd", "global"]:
|
||||
if c in df_all.columns:
|
||||
ghi_col = c
|
||||
break
|
||||
if ghi_col is None:
|
||||
print(f" WARNING: cannot find GHI column. Columns: {list(df_all.columns)[:20]}")
|
||||
return []
|
||||
|
||||
# Detect station column
|
||||
stn_col = None
|
||||
for c in ["station", "sta", "site", "STATION"]:
|
||||
if c in df_all.columns:
|
||||
stn_col = c
|
||||
break
|
||||
|
||||
print(f" Using ts_col='{ts_col}', ghi_col='{ghi_col}', station_col='{stn_col}'")
|
||||
|
||||
if stn_col:
|
||||
groups = {stn: sub for stn, sub in df_all.groupby(stn_col)}
|
||||
else:
|
||||
# Single station file — infer from filename or use first meta entry
|
||||
groups = {"unknown": df_all}
|
||||
|
||||
for stn_code, sub in groups.items():
|
||||
stn_key = str(stn_code).lower()
|
||||
meta = FIVE_SITE_META.get(stn_key, None)
|
||||
if meta is None:
|
||||
# Try to find lat/lon in data
|
||||
if "latitude" in sub.columns and "longitude" in sub.columns:
|
||||
lat = float(sub["latitude"].iloc[0])
|
||||
lon = float(sub["longitude"].iloc[0])
|
||||
name = stn_key
|
||||
else:
|
||||
print(f" SKIP {stn_key}: no metadata")
|
||||
continue
|
||||
else:
|
||||
lat, lon, name = meta["lat"], meta["lon"], meta["name"]
|
||||
|
||||
work = sub[[ts_col, ghi_col]].copy()
|
||||
work.columns = ["timestamp", "ghi"]
|
||||
work["timestamp"] = pd.to_datetime(work["timestamp"], utc=True, errors="coerce")
|
||||
work = work.dropna(subset=["timestamp"])
|
||||
|
||||
print(f" Processing {name} ({stn_key}): {len(work):,} rows ...", end=" ", flush=True)
|
||||
evts = extract_twilight_events(work, lat, lon, stn_key, name, "bsrn_5site")
|
||||
print(f"{len(evts)} events")
|
||||
all_events.extend(evts)
|
||||
|
||||
return all_events
|
||||
|
||||
|
||||
# ── dataset 2: CAELUS ZIP files ───────────────────────────────────────────────
|
||||
|
||||
CAELUS_BASE_URL = "https://zenodo.org/api/records/7897639/files/{filename}/content"
|
||||
|
||||
|
||||
def caelus_filename(station_code: str, year: int) -> str:
|
||||
return f"{station_code}_bsrn_{year}.zip"
|
||||
|
||||
|
||||
def list_caelus_available() -> dict[str, list[int]]:
|
||||
"""
|
||||
Probe Zenodo API to get available files, return dict of station -> [years].
|
||||
"""
|
||||
print("\n Querying CAELUS file list from Zenodo API ...", end=" ", flush=True)
|
||||
try:
|
||||
r = requests.get("https://zenodo.org/api/records/7897639", timeout=30)
|
||||
r.raise_for_status()
|
||||
data = r.json()
|
||||
available: dict[str, list[int]] = {}
|
||||
for f in data.get("files", []):
|
||||
fname = f.get("key", "")
|
||||
if fname.endswith(".zip") and "_bsrn_" in fname:
|
||||
parts = fname.replace(".zip", "").split("_bsrn_")
|
||||
if len(parts) == 2:
|
||||
code = parts[0]
|
||||
try:
|
||||
year = int(parts[1])
|
||||
available.setdefault(code, []).append(year)
|
||||
except ValueError:
|
||||
pass
|
||||
print(f"found {len(available)} stations")
|
||||
return available
|
||||
except Exception as e:
|
||||
print(f"FAILED: {e}")
|
||||
return {}
|
||||
|
||||
|
||||
def process_caelus_zip(zip_path: Path, station_code: str, meta: dict, year: int) -> list[dict]:
|
||||
"""Extract parquet from ZIP, parse, return events."""
|
||||
events = []
|
||||
try:
|
||||
with zipfile.ZipFile(zip_path, "r") as zf:
|
||||
parquet_files = [n for n in zf.namelist() if n.endswith(".parquet")]
|
||||
if not parquet_files:
|
||||
print(f" WARNING: no parquet in {zip_path.name}")
|
||||
return []
|
||||
for pf in parquet_files:
|
||||
with zf.open(pf) as f:
|
||||
df = pd.read_parquet(BytesIO(f.read()))
|
||||
|
||||
# Detect columns
|
||||
ts_col = None
|
||||
for c in ["timestamp", "time", "datetime", "Timestamp", "TIME"]:
|
||||
if c in df.columns:
|
||||
ts_col = c
|
||||
break
|
||||
if ts_col is None:
|
||||
for c in df.columns:
|
||||
if pd.api.types.is_datetime64_any_dtype(df[c]):
|
||||
ts_col = c
|
||||
break
|
||||
if ts_col is None:
|
||||
print(f" WARNING: no timestamp in {pf}")
|
||||
continue
|
||||
|
||||
ghi_col = None
|
||||
for c in ["ghi", "GHI", "SWD", "swd", "global_horizontal_irradiance", "global"]:
|
||||
if c in df.columns:
|
||||
ghi_col = c
|
||||
break
|
||||
if ghi_col is None:
|
||||
# Use diffuse if available
|
||||
for c in ["dhi", "DHI", "diffuse"]:
|
||||
if c in df.columns:
|
||||
ghi_col = c
|
||||
break
|
||||
if ghi_col is None:
|
||||
print(f" WARNING: no GHI column in {pf}. cols={list(df.columns)[:15]}")
|
||||
continue
|
||||
|
||||
work = df[[ts_col, ghi_col]].copy()
|
||||
work.columns = ["timestamp", "ghi"]
|
||||
work["timestamp"] = pd.to_datetime(work["timestamp"], utc=True, errors="coerce")
|
||||
work = work.dropna(subset=["timestamp"])
|
||||
|
||||
evts = extract_twilight_events(
|
||||
work,
|
||||
meta["lat"], meta["lon"],
|
||||
station_code, meta["name"],
|
||||
f"caelus_{year}",
|
||||
)
|
||||
events.extend(evts)
|
||||
|
||||
except zipfile.BadZipFile as e:
|
||||
print(f" ERROR: bad zip {zip_path.name}: {e}")
|
||||
except Exception as e:
|
||||
print(f" ERROR processing {zip_path.name}: {e}")
|
||||
|
||||
return events
|
||||
|
||||
|
||||
def process_caelus(max_stations: int = 8, max_years_per_station: int = 5) -> list[dict]:
|
||||
"""Download and process CAELUS priority stations."""
|
||||
print("\n=== CAELUS dataset (Zenodo 7897639) ===")
|
||||
available = list_caelus_available()
|
||||
|
||||
all_events = []
|
||||
|
||||
processed_count = 0
|
||||
for code, meta in PRIORITY_STATIONS.items():
|
||||
if processed_count >= max_stations:
|
||||
break
|
||||
|
||||
if code not in available:
|
||||
print(f" SKIP {code} ({meta['name']}): not in CAELUS dataset")
|
||||
continue
|
||||
|
||||
years = sorted(available[code])
|
||||
years_to_use = years[:max_years_per_station]
|
||||
print(f"\n Station: {meta['name']} ({code}) | lat={meta['lat']} lon={meta['lon']} | years: {years_to_use}")
|
||||
|
||||
station_events = []
|
||||
for year in years_to_use:
|
||||
fname = caelus_filename(code, year)
|
||||
url = CAELUS_BASE_URL.format(filename=fname)
|
||||
dest = RAW_BSRN / fname
|
||||
|
||||
ok = download_file(url, dest, fname)
|
||||
if not ok:
|
||||
continue
|
||||
|
||||
print(f" Processing {fname} ...", end=" ", flush=True)
|
||||
evts = process_caelus_zip(dest, code, meta, year)
|
||||
print(f"{len(evts)} events")
|
||||
station_events.extend(evts)
|
||||
|
||||
print(f" Total events for {code}: {len(station_events)}")
|
||||
all_events.extend(station_events)
|
||||
processed_count += 1
|
||||
|
||||
return all_events
|
||||
|
||||
|
||||
# ── SURFRAD download ──────────────────────────────────────────────────────────
|
||||
|
||||
SURFRAD_STATIONS = {
|
||||
"dra": {
|
||||
"name": "Desert Rock",
|
||||
"lat": 36.626, "lon": -116.018,
|
||||
"ftp_dir": "DRA",
|
||||
},
|
||||
"tbl": {
|
||||
"name": "Table Mountain",
|
||||
"lat": 40.125, "lon": -105.237,
|
||||
"ftp_dir": "TBL",
|
||||
},
|
||||
}
|
||||
SURFRAD_BASE = "https://gml.noaa.gov/aftp/data/radiation/surfrad"
|
||||
|
||||
|
||||
def parse_surfrad_file(content: bytes, station_meta: dict) -> pd.DataFrame:
|
||||
"""
|
||||
Parse SURFRAD dat file format.
|
||||
Returns dataframe with [timestamp, ghi] columns.
|
||||
"""
|
||||
lines = content.decode("utf-8", errors="replace").splitlines()
|
||||
|
||||
# Skip header (first line is site info, second is column header)
|
||||
# Format: year jday hour min dt zen dw_solar dw_solar_stdev ...
|
||||
records = []
|
||||
for line in lines[2:]:
|
||||
parts = line.split()
|
||||
if len(parts) < 9:
|
||||
continue
|
||||
try:
|
||||
yr = int(parts[0])
|
||||
jday = int(parts[1])
|
||||
hr = int(parts[2])
|
||||
mn = int(parts[3])
|
||||
# GHI is column index 6 (dw_solar = downwelling shortwave)
|
||||
ghi_raw = float(parts[6])
|
||||
|
||||
# Build datetime
|
||||
dt = datetime(yr, 1, 1, hr, mn, tzinfo=timezone.utc) + timedelta(days=jday - 1)
|
||||
|
||||
# SURFRAD uses -9999 for missing
|
||||
ghi = None if ghi_raw < -900 else max(0.0, ghi_raw)
|
||||
records.append({"timestamp": dt, "ghi": ghi})
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
|
||||
return pd.DataFrame(records)
|
||||
|
||||
|
||||
def process_surfrad(years: list[int] | None = None) -> list[dict]:
|
||||
"""Download and process SURFRAD Desert Rock and Table Mountain."""
|
||||
print("\n=== SURFRAD dataset (NOAA GML FTP) ===")
|
||||
if years is None:
|
||||
years = [2018, 2019, 2020] # 3 recent years
|
||||
|
||||
all_events = []
|
||||
|
||||
for code, meta in SURFRAD_STATIONS.items():
|
||||
station_dir = meta["ftp_dir"]
|
||||
print(f"\n Station: {meta['name']} ({code}) | lat={meta['lat']} lon={meta['lon']}")
|
||||
|
||||
for year in years:
|
||||
print(f" Year {year}:")
|
||||
station_events = []
|
||||
|
||||
# SURFRAD stores one file per day: DRA/YYYY/draYYYjjj.dat
|
||||
# where jjj = day of year (001-365)
|
||||
# We'll try to get the full year index
|
||||
year_dir_url = f"{SURFRAD_BASE}/{station_dir}/{year}/"
|
||||
try:
|
||||
r = requests.get(year_dir_url, timeout=30)
|
||||
if r.status_code != 200:
|
||||
print(f" Cannot access {year_dir_url}: {r.status_code}")
|
||||
continue
|
||||
# Parse file listing (Apache directory index)
|
||||
import re
|
||||
files = re.findall(rf'{code.lower()}\d{{5}}\.dat', r.text)
|
||||
files = sorted(set(files))
|
||||
print(f" Found {len(files)} daily files")
|
||||
|
||||
dest_dir = RAW_BSRN / "surfrad" / station_dir / str(year)
|
||||
dest_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
day_records = []
|
||||
for fname in files:
|
||||
dest = dest_dir / fname
|
||||
file_url = f"{year_dir_url}{fname}"
|
||||
ok = download_file(file_url, dest, fname)
|
||||
if not ok:
|
||||
continue
|
||||
with open(dest, "rb") as f:
|
||||
content = f.read()
|
||||
day_df = parse_surfrad_file(content, meta)
|
||||
if not day_df.empty:
|
||||
day_records.append(day_df)
|
||||
|
||||
if day_records:
|
||||
df_year = pd.concat(day_records, ignore_index=True)
|
||||
print(f" {len(df_year):,} 1-min records loaded", end=" ")
|
||||
evts = extract_twilight_events(
|
||||
df_year, meta["lat"], meta["lon"],
|
||||
code, meta["name"], f"surfrad_{year}",
|
||||
)
|
||||
print(f"→ {len(evts)} twilight events")
|
||||
station_events.extend(evts)
|
||||
|
||||
except Exception as e:
|
||||
print(f" ERROR: {e}")
|
||||
continue
|
||||
|
||||
all_events.extend(station_events)
|
||||
|
||||
return all_events
|
||||
|
||||
|
||||
# ── save results ──────────────────────────────────────────────────────────────
|
||||
|
||||
def save_sightings(events: list[dict], filename: str) -> Path:
|
||||
"""Save events list to CSV."""
|
||||
if not events:
|
||||
print(f" WARNING: no events to save to {filename}")
|
||||
return RAW_SIGHTINGS / filename
|
||||
df = pd.DataFrame(events)
|
||||
df = df.sort_values(["station_code", "date", "event_type"]).reset_index(drop=True)
|
||||
out = RAW_SIGHTINGS / filename
|
||||
df.to_csv(out, index=False)
|
||||
print(f" Saved {len(df)} events → {out}")
|
||||
return out
|
||||
|
||||
|
||||
def print_summary(events: list[dict], label: str):
|
||||
if not events:
|
||||
print(f"\n{label}: no events")
|
||||
return
|
||||
df = pd.DataFrame(events)
|
||||
print(f"\n{'='*60}")
|
||||
print(f"SUMMARY: {label}")
|
||||
print(f" Total events: {len(df)}")
|
||||
print(f" Dawn events: {(df['event_type']=='dawn').sum()}")
|
||||
print(f" Dusk events: {(df['event_type']=='dusk').sum()}")
|
||||
print(f" Stations: {df['station_code'].nunique()} — {sorted(df['station_code'].unique())}")
|
||||
print(f" Date range: {df['date'].min()} to {df['date'].max()}")
|
||||
dep = df["solar_depression_deg"]
|
||||
print(f" Depression angle: mean={dep.mean():.2f}° std={dep.std():.2f}° "
|
||||
f"min={dep.min():.2f}° max={dep.max():.2f}°")
|
||||
if "event_type" in df.columns:
|
||||
for evt_type in ["dawn", "dusk"]:
|
||||
sub = df[df["event_type"] == evt_type]
|
||||
if not sub.empty:
|
||||
d = sub["solar_depression_deg"]
|
||||
print(f" {evt_type.upper()} depression: mean={d.mean():.2f}° "
|
||||
f"p10={d.quantile(0.1):.2f}° p90={d.quantile(0.9):.2f}°")
|
||||
print(f"{'='*60}")
|
||||
|
||||
|
||||
# ── main ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
|
||||
print("BSRN Twilight Extractor")
|
||||
print(f"Output dir: {RAW_SIGHTINGS}")
|
||||
print(f"Raw data: {RAW_BSRN}")
|
||||
print(f"GHI threshold: {PRIMARY_THRESHOLD} W/m²\n")
|
||||
|
||||
all_events = []
|
||||
|
||||
# 1. BSRN 5-site (quick, 58.5 MB)
|
||||
five_site_events = process_five_site()
|
||||
save_sightings(five_site_events, "bsrn_5site_twilight.csv")
|
||||
print_summary(five_site_events, "BSRN 5-site dataset")
|
||||
all_events.extend(five_site_events)
|
||||
|
||||
# 2. CAELUS (priority stations, up to 8 stations × 5 years)
|
||||
caelus_events = process_caelus(max_stations=8, max_years_per_station=5)
|
||||
save_sightings(caelus_events, "bsrn_caelus_twilight.csv")
|
||||
print_summary(caelus_events, "CAELUS dataset")
|
||||
all_events.extend(caelus_events)
|
||||
|
||||
# 3. SURFRAD (Desert Rock + Table Mountain)
|
||||
surfrad_events = process_surfrad(years=[2018, 2019, 2020])
|
||||
save_sightings(surfrad_events, "surfrad_twilight.csv")
|
||||
print_summary(surfrad_events, "SURFRAD dataset")
|
||||
all_events.extend(surfrad_events)
|
||||
|
||||
# Combined
|
||||
save_sightings(all_events, "bsrn_all_twilight.csv")
|
||||
print_summary(all_events, "ALL SOURCES COMBINED")
|
||||
|
||||
print("\nDone.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
563
src/bsrn_process.py
Normal file
563
src/bsrn_process.py
Normal file
|
|
@ -0,0 +1,563 @@
|
|||
"""
|
||||
BSRN Twilight Extractor — corrected for actual file formats.
|
||||
|
||||
Dataset formats:
|
||||
1. BSRN 5-site parquet (Zenodo 10593079):
|
||||
- Multi-index: (times_utc, site)
|
||||
- Columns: climate, longitude, latitude, sza, eth, ghicda, ghics, difcs, ghi, dif, sky_type
|
||||
- 'site' values like 'asp/bsrn', 'bon/bsrn', etc.
|
||||
|
||||
2. CAELUS ZIPs (Zenodo 7897639):
|
||||
- Each ZIP contains one CSV named {code}_bsrn_{year}.csv
|
||||
- Columns: times_utc, sza, eth, ghi, dif, ghics, ghicda, sky_type
|
||||
- sza = solar zenith angle (degrees). Depression = sza - 90 when sza > 90.
|
||||
|
||||
For twilight extraction we use BOTH ghi AND dif (diffuse horizontal irradiance).
|
||||
Diffuse is more sensitive at twilight — it captures scattered skylight before direct beam arrives.
|
||||
|
||||
The `sza` column is authoritative for solar geometry — no need to recompute.
|
||||
Depression angle = sza - 90° (valid when sza > 90°, i.e., sun below horizon).
|
||||
|
||||
Thresholds extracted (multi-threshold approach):
|
||||
- 0.5, 1, 2, 5, 10 W/m² for both ghi and dif
|
||||
|
||||
Primary threshold for the main output: dif >= 1.0 W/m² (diffuse is more reliable at twilight).
|
||||
"""
|
||||
|
||||
import os
|
||||
import math
|
||||
import zipfile
|
||||
import requests
|
||||
import re
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from pathlib import Path
|
||||
from io import BytesIO
|
||||
|
||||
# ── paths ─────────────────────────────────────────────────────────────────────
|
||||
BASE = Path("/Volumes/X9/Sites/acamarata/pray-calc-ml")
|
||||
RAW_BSRN = BASE / "data/raw/bsrn"
|
||||
RAW_SIGHTINGS = BASE / "data/raw/raw_sightings"
|
||||
RAW_BSRN.mkdir(parents=True, exist_ok=True)
|
||||
RAW_SIGHTINGS.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# ── thresholds ────────────────────────────────────────────────────────────────
|
||||
THRESHOLDS = [0.5, 1.0, 2.0, 5.0, 10.0]
|
||||
PRIMARY_THRESHOLD = 1.0 # W/m²
|
||||
|
||||
# ── station metadata ──────────────────────────────────────────────────────────
|
||||
# For CAELUS: station codes and their coordinates
|
||||
CAELUS_STATION_META = {
|
||||
"sbo": {"name": "Sede Boqer", "lat": 30.8597, "lon": 34.7794, "region": "Negev/Israel"},
|
||||
"tam": {"name": "Tamanrasset", "lat": 22.7903, "lon": 5.5292, "region": "Algeria"},
|
||||
"xia": {"name": "Xianghe", "lat": 39.754, "lon": 116.962, "region": "China"},
|
||||
"iza": {"name": "Izana", "lat": 28.3093, "lon": -16.4993, "region": "Canary Islands/Spain"},
|
||||
"ish": {"name": "Ishigakijima", "lat": 24.3367, "lon": 124.1644, "region": "Japan"},
|
||||
"dra": {"name": "Desert Rock", "lat": 36.626, "lon": -116.018, "region": "Nevada/USA"},
|
||||
"e13": {"name": "SouthernGreatPlains", "lat": 36.605, "lon": -97.485, "region": "Oklahoma/USA"},
|
||||
"tat": {"name": "Tateno", "lat": 36.0581, "lon": 140.1258, "region": "Japan"},
|
||||
"fua": {"name": "Fukuoka", "lat": 33.5822, "lon": 130.3764, "region": "Japan"},
|
||||
"gcr": {"name": "GoodwinCreek", "lat": 34.2547, "lon": -89.8729, "region": "Mississippi/USA"},
|
||||
"bil": {"name": "Billings/ARM", "lat": 36.605, "lon": -97.516, "region": "Oklahoma/USA"},
|
||||
"abs": {"name": "Abashiri", "lat": 44.0178, "lon": 144.2797, "region": "Japan"},
|
||||
"lyu": {"name": "Lanyu Island", "lat": 22.037, "lon": 121.5583, "region": "Taiwan"},
|
||||
"lln": {"name": "Lulin", "lat": 23.4686, "lon": 120.8736, "region": "Taiwan"},
|
||||
"cnr": {"name": "Cener", "lat": 42.816, "lon": -1.601, "region": "Spain/Navarra"},
|
||||
"flo": {"name": "Florina", "lat": 40.78, "lon": 21.38, "region": "Greece"},
|
||||
"car": {"name": "Carpentras", "lat": 44.0833, "lon": 5.0583, "region": "France"},
|
||||
}
|
||||
|
||||
# Five-site parquet station metadata (lat/lon stored in the file itself)
|
||||
FIVE_SITE_MAP = {
|
||||
"mnm": {"name": "Minamitorishima", "region": "Japan"},
|
||||
"asp": {"name": "Alice Springs", "region": "Australia"},
|
||||
"car": {"name": "Carpentras", "region": "France"},
|
||||
"bon": {"name": "Bondville", "region": "USA/Illinois"},
|
||||
"son": {"name": "Sonnblick", "region": "Austria"},
|
||||
}
|
||||
|
||||
CAELUS_BASE_URL = "https://zenodo.org/api/records/7897639/files/{filename}/content"
|
||||
FIVE_SITE_URL = "https://zenodo.org/records/10593079/files/bsrn_valid_five_sites.parquet"
|
||||
FIVE_SITE_PATH = RAW_BSRN / "bsrn_valid_five_sites.parquet"
|
||||
|
||||
# ── download helper ───────────────────────────────────────────────────────────
|
||||
|
||||
def download_file(url: str, dest: Path, label: str = "") -> bool:
|
||||
if dest.exists() and dest.stat().st_size > 1000:
|
||||
print(f" [cached] {label or dest.name}")
|
||||
return True
|
||||
try:
|
||||
print(f" [dl] {label or dest.name} ...", end=" ", flush=True)
|
||||
r = requests.get(url, timeout=180, stream=True)
|
||||
r.raise_for_status()
|
||||
downloaded = 0
|
||||
with open(dest, "wb") as f:
|
||||
for chunk in r.iter_content(chunk_size=131072):
|
||||
f.write(chunk)
|
||||
downloaded += len(chunk)
|
||||
print(f"OK ({downloaded/1_048_576:.1f} MB)")
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"FAILED: {e}")
|
||||
if dest.exists():
|
||||
dest.unlink()
|
||||
return False
|
||||
|
||||
|
||||
# ── twilight extraction ───────────────────────────────────────────────────────
|
||||
|
||||
def extract_events_from_df(
|
||||
df: pd.DataFrame,
|
||||
lat: float,
|
||||
lon: float,
|
||||
station_code: str,
|
||||
station_name: str,
|
||||
source: str,
|
||||
) -> list[dict]:
|
||||
"""
|
||||
df must have columns: times_utc (datetime64[ns] UTC), sza (float), ghi (float), dif (float).
|
||||
|
||||
For each day:
|
||||
dawn = first minute where dif (or ghi if dif is null) crosses PRIMARY_THRESHOLD
|
||||
AND sza > 90 (sun still below horizon)
|
||||
dusk = last minute where dif (or ghi) >= PRIMARY_THRESHOLD
|
||||
AND sza > 90
|
||||
|
||||
Records depression angle = sza - 90.0 at the event moment.
|
||||
Also records raw threshold-by-threshold data for all THRESHOLDS.
|
||||
"""
|
||||
events = []
|
||||
|
||||
df = df.copy()
|
||||
df["times_utc"] = pd.to_datetime(df["times_utc"], utc=True, errors="coerce")
|
||||
df = df.dropna(subset=["times_utc"]).sort_values("times_utc").reset_index(drop=True)
|
||||
|
||||
# Build irradiance signal: prefer dif (diffuse) which is more sensitive at twilight
|
||||
# Fall back to ghi if dif is all NaN
|
||||
if "dif" in df.columns:
|
||||
df["irr"] = pd.to_numeric(df["dif"], errors="coerce")
|
||||
else:
|
||||
df["irr"] = float("nan")
|
||||
|
||||
if "ghi" in df.columns:
|
||||
ghi_vals = pd.to_numeric(df["ghi"], errors="coerce")
|
||||
# Fill irr NaN with ghi
|
||||
df["irr"] = df["irr"].combine_first(ghi_vals)
|
||||
|
||||
df["sza"] = pd.to_numeric(df["sza"], errors="coerce")
|
||||
df["date"] = df["times_utc"].dt.date
|
||||
|
||||
for day, grp in df.groupby("date"):
|
||||
grp = grp.sort_values("times_utc").reset_index(drop=True)
|
||||
|
||||
irr = grp["irr"].values
|
||||
sza = grp["sza"].values
|
||||
ts = grp["times_utc"].values
|
||||
|
||||
n = len(irr)
|
||||
if n < 10:
|
||||
continue
|
||||
|
||||
# For each threshold, extract dawn/dusk
|
||||
for threshold in THRESHOLDS:
|
||||
# Dawn: first index where irr >= threshold AND sza > 90
|
||||
# Require at least 2 consecutive sub-threshold readings immediately before
|
||||
dawn_idx = None
|
||||
for i in range(2, n):
|
||||
if (
|
||||
not np.isnan(irr[i])
|
||||
and irr[i] >= threshold
|
||||
and not np.isnan(sza[i])
|
||||
and sza[i] > 90.0
|
||||
):
|
||||
# Confirm at least 2 prior readings were sub-threshold
|
||||
prev_ok = all(
|
||||
(np.isnan(v) or v < threshold)
|
||||
for v in irr[max(0, i-3):i]
|
||||
)
|
||||
if prev_ok:
|
||||
dawn_idx = i
|
||||
break
|
||||
|
||||
# Dusk: last index where irr >= threshold AND sza > 90
|
||||
# Require at least 2 consecutive sub-threshold readings immediately after
|
||||
dusk_idx = None
|
||||
for i in range(n - 3, -1, -1):
|
||||
if (
|
||||
not np.isnan(irr[i])
|
||||
and irr[i] >= threshold
|
||||
and not np.isnan(sza[i])
|
||||
and sza[i] > 90.0
|
||||
):
|
||||
after_ok = all(
|
||||
(np.isnan(v) or v < threshold)
|
||||
for v in irr[i+1:min(n, i+4)]
|
||||
)
|
||||
if after_ok:
|
||||
dusk_idx = i
|
||||
break
|
||||
|
||||
for idx, event_type in [(dawn_idx, "dawn"), (dusk_idx, "dusk")]:
|
||||
if idx is None:
|
||||
continue
|
||||
|
||||
sza_val = float(sza[idx])
|
||||
# sza must be > 90 (sun below horizon) — extra safety check
|
||||
if sza_val <= 90.0:
|
||||
continue
|
||||
# Depression angle in degrees
|
||||
dep = round(sza_val - 90.0, 4)
|
||||
# Only record plausible twilight depression (0.5° to 20°)
|
||||
if dep < 0.3 or dep > 20.0:
|
||||
continue
|
||||
|
||||
utc_ts = pd.Timestamp(ts[idx])
|
||||
irr_val = float(irr[idx])
|
||||
|
||||
# Also grab ghi/dif individually at this moment
|
||||
ghi_at = float(grp["ghi"].iloc[idx]) if "ghi" in grp.columns else float("nan")
|
||||
dif_at = float(grp["dif"].iloc[idx]) if "dif" in grp.columns else float("nan")
|
||||
|
||||
events.append({
|
||||
"date": str(day),
|
||||
"event_type": event_type,
|
||||
"utc_time": utc_ts.strftime("%H:%M:%S"),
|
||||
"sza_deg": round(sza_val, 4),
|
||||
"solar_depression_deg": dep,
|
||||
"irr_wm2": round(irr_val, 2),
|
||||
"ghi_wm2": round(ghi_at, 2) if not math.isnan(ghi_at) else None,
|
||||
"dif_wm2": round(dif_at, 2) if not math.isnan(dif_at) else None,
|
||||
"threshold_wm2": threshold,
|
||||
"lat": lat,
|
||||
"lon": lon,
|
||||
"station_code": station_code,
|
||||
"station_name": station_name,
|
||||
"source": source,
|
||||
})
|
||||
|
||||
return events
|
||||
|
||||
|
||||
# ── 5-site dataset ────────────────────────────────────────────────────────────
|
||||
|
||||
def process_five_site() -> list[dict]:
|
||||
print("\n=== BSRN 5-site parquet (Zenodo 10593079) ===")
|
||||
ok = download_file(FIVE_SITE_URL, FIVE_SITE_PATH, "bsrn_valid_five_sites.parquet")
|
||||
if not ok:
|
||||
return []
|
||||
|
||||
print(" Loading ...", end=" ", flush=True)
|
||||
df_raw = pd.read_parquet(FIVE_SITE_PATH)
|
||||
df_raw = df_raw.reset_index() # brings times_utc and site into columns
|
||||
print(f"OK — {len(df_raw):,} rows")
|
||||
print(f" Columns: {list(df_raw.columns)}")
|
||||
|
||||
all_events = []
|
||||
for site_val, grp in df_raw.groupby("site"):
|
||||
# site_val is like 'asp/bsrn'
|
||||
code = str(site_val).split("/")[0].lower()
|
||||
meta = FIVE_SITE_MAP.get(code, {"name": code, "region": "unknown"})
|
||||
|
||||
# Get lat/lon from the data itself
|
||||
lat = float(grp["latitude"].iloc[0])
|
||||
lon = float(grp["longitude"].iloc[0])
|
||||
|
||||
# Rename columns to standard
|
||||
sub = grp[["times_utc", "sza", "ghi", "dif"]].copy()
|
||||
|
||||
print(f" {meta['name']} ({code}) lat={lat:.3f} lon={lon:.3f}: "
|
||||
f"{len(sub):,} rows ...", end=" ", flush=True)
|
||||
|
||||
evts = extract_events_from_df(sub, lat, lon, code, meta["name"], "bsrn_5site")
|
||||
print(f"{len(evts)} events")
|
||||
all_events.extend(evts)
|
||||
|
||||
return all_events
|
||||
|
||||
|
||||
# ── CAELUS dataset ─────────────────────────────────────────────────────────────
|
||||
|
||||
def list_caelus_files() -> dict[str, list[int]]:
|
||||
"""Return dict of station_code -> sorted list of available years."""
|
||||
print(" Querying Zenodo API ...", end=" ", flush=True)
|
||||
try:
|
||||
r = requests.get("https://zenodo.org/api/records/7897639", timeout=30)
|
||||
r.raise_for_status()
|
||||
data = r.json()
|
||||
available: dict[str, list[int]] = {}
|
||||
for f in data.get("files", []):
|
||||
fname = f.get("key", "")
|
||||
if fname.endswith(".zip") and "_bsrn_" in fname:
|
||||
parts = fname.replace(".zip", "").split("_bsrn_")
|
||||
if len(parts) == 2:
|
||||
code = parts[0]
|
||||
try:
|
||||
year = int(parts[1])
|
||||
available.setdefault(code, []).append(year)
|
||||
except ValueError:
|
||||
pass
|
||||
for k in available:
|
||||
available[k] = sorted(available[k])
|
||||
print(f"{len(available)} stations found")
|
||||
return available
|
||||
except Exception as e:
|
||||
print(f"FAILED: {e}")
|
||||
return {}
|
||||
|
||||
|
||||
def process_caelus_zip(zip_path: Path, station_code: str, meta: dict, year: int) -> list[dict]:
|
||||
events = []
|
||||
try:
|
||||
with zipfile.ZipFile(zip_path, "r") as zf:
|
||||
csv_files = [n for n in zf.namelist() if n.endswith(".csv")]
|
||||
if not csv_files:
|
||||
print(f" WARNING: no CSV in {zip_path.name}")
|
||||
return []
|
||||
with zf.open(csv_files[0]) as f:
|
||||
df = pd.read_csv(f)
|
||||
|
||||
# Validate expected columns
|
||||
required = {"times_utc", "sza", "ghi"}
|
||||
missing = required - set(df.columns)
|
||||
if missing:
|
||||
print(f" WARNING: missing columns {missing} in {zip_path.name}")
|
||||
return []
|
||||
|
||||
evts = extract_events_from_df(
|
||||
df, meta["lat"], meta["lon"],
|
||||
station_code, meta["name"], f"caelus_{year}",
|
||||
)
|
||||
return evts
|
||||
|
||||
except zipfile.BadZipFile:
|
||||
print(f" ERROR: corrupt zip {zip_path.name}")
|
||||
return []
|
||||
except Exception as e:
|
||||
print(f" ERROR {zip_path.name}: {e}")
|
||||
return []
|
||||
|
||||
|
||||
def process_caelus(priority_codes: list[str] | None = None, max_years: int = 5) -> list[dict]:
|
||||
print("\n=== CAELUS dataset (Zenodo 7897639) ===")
|
||||
available = list_caelus_files()
|
||||
if not available:
|
||||
return []
|
||||
|
||||
if priority_codes is None:
|
||||
# All known stations, sorted by relevance
|
||||
priority_codes = list(CAELUS_STATION_META.keys())
|
||||
|
||||
all_events = []
|
||||
|
||||
for code in priority_codes:
|
||||
if code not in available:
|
||||
print(f" SKIP {code}: not in CAELUS")
|
||||
continue
|
||||
|
||||
meta = CAELUS_STATION_META.get(code)
|
||||
if meta is None:
|
||||
print(f" SKIP {code}: no metadata")
|
||||
continue
|
||||
|
||||
years = available[code][:max_years]
|
||||
print(f"\n {meta['name']} ({code}) "
|
||||
f"lat={meta['lat']} lon={meta['lon']} region={meta['region']}")
|
||||
print(f" Years: {years}")
|
||||
|
||||
station_events = []
|
||||
for year in years:
|
||||
fname = f"{code}_bsrn_{year}.zip"
|
||||
url = CAELUS_BASE_URL.format(filename=fname)
|
||||
dest = RAW_BSRN / fname
|
||||
|
||||
ok = download_file(url, dest, fname)
|
||||
if not ok:
|
||||
continue
|
||||
|
||||
print(f" {fname} ...", end=" ", flush=True)
|
||||
evts = process_caelus_zip(dest, code, meta, year)
|
||||
print(f"{len(evts)} events ({year})")
|
||||
station_events.extend(evts)
|
||||
|
||||
print(f" → {len(station_events)} events total for {code}")
|
||||
all_events.extend(station_events)
|
||||
|
||||
return all_events
|
||||
|
||||
|
||||
# ── SURFRAD ───────────────────────────────────────────────────────────────────
|
||||
|
||||
SURFRAD_STATIONS = {
|
||||
"dra": {"name": "Desert Rock", "lat": 36.626, "lon": -116.018, "dir": "DRA"},
|
||||
"tbl": {"name": "TableMountain","lat": 40.125, "lon": -105.237, "dir": "TBL"},
|
||||
}
|
||||
SURFRAD_BASE = "https://gml.noaa.gov/aftp/data/radiation/surfrad"
|
||||
|
||||
|
||||
def parse_surfrad_day(content: bytes) -> pd.DataFrame:
|
||||
"""Parse a SURFRAD daily .dat file to a dataframe with [times_utc, sza, ghi, dif]."""
|
||||
lines = content.decode("utf-8", errors="replace").splitlines()
|
||||
records = []
|
||||
for line in lines[2:]:
|
||||
parts = line.split()
|
||||
if len(parts) < 10:
|
||||
continue
|
||||
try:
|
||||
yr = int(parts[0])
|
||||
jday = int(parts[1])
|
||||
hr = int(parts[2])
|
||||
mn = int(parts[3])
|
||||
zen = float(parts[5]) # solar zenith angle (degrees)
|
||||
ghi = float(parts[6]) # downwelling shortwave (W/m²)
|
||||
dif = float(parts[9]) # diffuse shortwave (W/m²)
|
||||
|
||||
dt = datetime(yr, 1, 1, hr, mn, tzinfo=timezone.utc) + timedelta(days=jday - 1)
|
||||
|
||||
# -9999 = missing
|
||||
records.append({
|
||||
"times_utc": dt,
|
||||
"sza": None if zen < -900 else zen,
|
||||
"ghi": None if ghi < -900 else max(0.0, ghi),
|
||||
"dif": None if dif < -900 else max(0.0, dif),
|
||||
})
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
return pd.DataFrame(records)
|
||||
|
||||
|
||||
def process_surfrad(years: list[int] | None = None) -> list[dict]:
|
||||
print("\n=== SURFRAD (NOAA GML) ===")
|
||||
if years is None:
|
||||
years = [2018, 2019, 2020]
|
||||
|
||||
all_events = []
|
||||
|
||||
for code, meta in SURFRAD_STATIONS.items():
|
||||
print(f"\n {meta['name']} ({code}) lat={meta['lat']} lon={meta['lon']}")
|
||||
dest_root = RAW_BSRN / "surfrad" / meta["dir"]
|
||||
dest_root.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
for year in years:
|
||||
year_url = f"{SURFRAD_BASE}/{meta['dir']}/{year}/"
|
||||
try:
|
||||
r = requests.get(year_url, timeout=30)
|
||||
if r.status_code != 200:
|
||||
print(f" {year}: HTTP {r.status_code} — skipping")
|
||||
continue
|
||||
# Parse file listing
|
||||
files = sorted(set(re.findall(rf'{code}\d{{5}}\.dat', r.text)))
|
||||
print(f" {year}: {len(files)} daily files", end=" ")
|
||||
except Exception as e:
|
||||
print(f" {year}: ERROR listing: {e}")
|
||||
continue
|
||||
|
||||
dest_dir = dest_root / str(year)
|
||||
dest_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
day_dfs = []
|
||||
for fname in files:
|
||||
dest = dest_dir / fname
|
||||
furl = f"{year_url}{fname}"
|
||||
ok = download_file(furl, dest, fname)
|
||||
if not ok:
|
||||
continue
|
||||
with open(dest, "rb") as f:
|
||||
day_df = parse_surfrad_day(f.read())
|
||||
if not day_df.empty:
|
||||
day_dfs.append(day_df)
|
||||
|
||||
if not day_dfs:
|
||||
print("→ 0 events")
|
||||
continue
|
||||
|
||||
df_year = pd.concat(day_dfs, ignore_index=True)
|
||||
evts = extract_events_from_df(
|
||||
df_year, meta["lat"], meta["lon"],
|
||||
code, meta["name"], f"surfrad_{year}",
|
||||
)
|
||||
print(f"→ {len(evts)} events")
|
||||
all_events.extend(evts)
|
||||
|
||||
return all_events
|
||||
|
||||
|
||||
# ── save / summarise ──────────────────────────────────────────────────────────
|
||||
|
||||
def save_events(events: list[dict], filename: str) -> Path:
|
||||
out = RAW_SIGHTINGS / filename
|
||||
if not events:
|
||||
print(f" (no events for {filename})")
|
||||
return out
|
||||
df = pd.DataFrame(events)
|
||||
df = df.sort_values(["station_code", "date", "event_type", "threshold_wm2"])
|
||||
df.to_csv(out, index=False)
|
||||
print(f" Saved {len(df):,} rows → {out}")
|
||||
return out
|
||||
|
||||
|
||||
def summarise(events: list[dict], label: str):
|
||||
print(f"\n{'='*65}")
|
||||
print(f"SUMMARY: {label}")
|
||||
if not events:
|
||||
print(" (no events)")
|
||||
print("=" * 65)
|
||||
return
|
||||
df = pd.DataFrame(events)
|
||||
# Focus summary on primary threshold
|
||||
prim = df[df["threshold_wm2"] == PRIMARY_THRESHOLD]
|
||||
print(f" All threshold events: {len(df):,}")
|
||||
print(f" At {PRIMARY_THRESHOLD} W/m² threshold: {len(prim):,}")
|
||||
print(f" Stations: {df['station_code'].nunique()} — {sorted(df['station_code'].unique())}")
|
||||
if not prim.empty:
|
||||
print(f" Date range: {prim['date'].min()} to {prim['date'].max()}")
|
||||
for evt_type in ["dawn", "dusk"]:
|
||||
sub = prim[prim["event_type"] == evt_type]
|
||||
if sub.empty:
|
||||
continue
|
||||
d = sub["solar_depression_deg"]
|
||||
print(f" {evt_type.upper()} events: {len(sub):,} | "
|
||||
f"mean dep={d.mean():.2f}° std={d.std():.2f}° "
|
||||
f"p10={d.quantile(0.1):.2f}° p90={d.quantile(0.9):.2f}°")
|
||||
print("=" * 65)
|
||||
|
||||
|
||||
# ── main ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
|
||||
print("BSRN Twilight Extractor v2")
|
||||
print(f"Primary threshold: {PRIMARY_THRESHOLD} W/m²")
|
||||
print(f"All thresholds: {THRESHOLDS}")
|
||||
print()
|
||||
|
||||
all_events = []
|
||||
|
||||
# 1. BSRN 5-site
|
||||
five_events = process_five_site()
|
||||
save_events(five_events, "bsrn_5site_twilight.csv")
|
||||
summarise(five_events, "BSRN 5-site")
|
||||
all_events.extend(five_events)
|
||||
|
||||
# 2. CAELUS — priority stations in 20-45°N
|
||||
priority = ["sbo", "tam", "xia", "iza", "ish", "dra", "e13", "tat", "fua",
|
||||
"gcr", "bil", "abs", "lyu", "cnr", "flo"]
|
||||
caelus_events = process_caelus(priority_codes=priority, max_years=5)
|
||||
save_events(caelus_events, "bsrn_caelus_twilight.csv")
|
||||
summarise(caelus_events, "CAELUS")
|
||||
all_events.extend(caelus_events)
|
||||
|
||||
# 3. SURFRAD — Desert Rock (arid, 36.6°N) + Table Mountain
|
||||
surfrad_events = process_surfrad(years=[2018, 2019, 2020])
|
||||
save_events(surfrad_events, "surfrad_twilight.csv")
|
||||
summarise(surfrad_events, "SURFRAD")
|
||||
all_events.extend(surfrad_events)
|
||||
|
||||
# Combined output
|
||||
save_events(all_events, "bsrn_all_twilight.csv")
|
||||
summarise(all_events, "ALL SOURCES")
|
||||
|
||||
print("\nDone.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
93
src/collect/academic_paper_fetcher.py
Normal file
93
src/collect/academic_paper_fetcher.py
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
import requests
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
import time
|
||||
|
||||
PAPERS_DIR = Path("research/downloaded_papers")
|
||||
PAPERS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Queries targeting Islamic twilight observations
|
||||
QUERIES = [
|
||||
"fajr twilight observation angle",
|
||||
"subuh sky brightness meter",
|
||||
"shafaq twilight measurement isha",
|
||||
"depression angle fajr isha",
|
||||
"true dawn observation measurements"
|
||||
]
|
||||
|
||||
def fetch_openalex_papers(query):
|
||||
url = f"https://api.openalex.org/works?search={query}&per-page=50&filter=has_pdf_url:true"
|
||||
print(f"Querying OpenAlex: {query}")
|
||||
try:
|
||||
res = requests.get(url, timeout=15)
|
||||
if res.status_code != 200:
|
||||
return []
|
||||
data = res.json()
|
||||
return data.get("results", [])
|
||||
except Exception as e:
|
||||
print(f"Error fetching {query}: {e}")
|
||||
return []
|
||||
|
||||
def sanitize_filename(title):
|
||||
return re.sub(r"[^a-zA-Z0-9]+", "_", title).strip("_")
|
||||
|
||||
def main():
|
||||
seen_titles = set()
|
||||
total_downloaded = 0
|
||||
|
||||
# Check existing files to avoid duplicates
|
||||
existing_files = list(PAPERS_DIR.glob("*.pdf"))
|
||||
for file in existing_files:
|
||||
name_no_ext = file.stem.lower()
|
||||
seen_titles.add(name_no_ext)
|
||||
|
||||
for query in QUERIES:
|
||||
works = fetch_openalex_papers(query)
|
||||
for work in works:
|
||||
title = work.get("title")
|
||||
if not title:
|
||||
continue
|
||||
|
||||
clean_title = sanitize_filename(title).lower()
|
||||
|
||||
# Simple duplicate check
|
||||
if any(clean_title[:30] in seen[:30] for seen in seen_titles):
|
||||
print(f"Skipping duplicate/known: {title}")
|
||||
continue
|
||||
|
||||
pdf_url = work.get("open_access", {}).get("oa_url")
|
||||
if not pdf_url or not pdf_url.endswith(".pdf"):
|
||||
continue
|
||||
|
||||
year = work.get("publication_year", "0000")
|
||||
filename = f"{year}_{clean_title[:60]}.pdf"
|
||||
filepath = PAPERS_DIR / filename
|
||||
|
||||
if filepath.exists():
|
||||
print(f"Already downloaded: {filename}")
|
||||
continue
|
||||
|
||||
print(f"Downloading: {title} ({year})")
|
||||
print(f" URL: {pdf_url}")
|
||||
try:
|
||||
# Add headers to mimic browser
|
||||
headers = {'User-Agent': 'Mozilla/5.0'}
|
||||
pdf_res = requests.get(pdf_url, headers=headers, timeout=20)
|
||||
if pdf_res.status_code == 200:
|
||||
with open(filepath, "wb") as f:
|
||||
f.write(pdf_res.content)
|
||||
print(f" -> Saved as {filename}")
|
||||
seen_titles.add(clean_title)
|
||||
total_downloaded += 1
|
||||
else:
|
||||
print(f" -> Failed with status {pdf_res.status_code}")
|
||||
except Exception as e:
|
||||
print(f" -> Download failed: {e}")
|
||||
|
||||
time.sleep(1) # Be nice to servers
|
||||
|
||||
print(f"\nTotal new papers downloaded: {total_downloaded}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
226
src/collect/download_gan_mn.py
Normal file
226
src/collect/download_gan_mn.py
Normal file
|
|
@ -0,0 +1,226 @@
|
|||
"""
|
||||
Download all GaN-MN monthly and annual CSV files, then process each one
|
||||
with the GaN-MN processor and save results to data/raw/raw_sightings/.
|
||||
|
||||
Run from repo root:
|
||||
/Volumes/X9/Sites/acamarata/pray-calc-ml/.venv/bin/python \
|
||||
src/collect/download_gan_mn.py
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
# Add repo root to path so we can import the processor
|
||||
REPO_ROOT = Path(__file__).resolve().parent.parent.parent
|
||||
sys.path.insert(0, str(REPO_ROOT))
|
||||
|
||||
from src.collect.gan_mn_processor import process_gan_mn_csv # noqa: E402
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||
datefmt="%H:%M:%S",
|
||||
)
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
RAW_DIR = REPO_ROOT / "data" / "raw" / "gan_mn"
|
||||
SIGHTINGS_DIR = REPO_ROOT / "data" / "raw" / "raw_sightings"
|
||||
RAW_DIR.mkdir(parents=True, exist_ok=True)
|
||||
SIGHTINGS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# All monthly files, 2022-2025 (skip already-downloaded ones)
|
||||
# Format: (label, bit.ly URL, local filename)
|
||||
MONTHLY_FILES = [
|
||||
# 2025 (skip Jun 2025 and Jan 2025 — already have them)
|
||||
("Aug_2025", "https://bit.ly/4a9aXIP", "GaN-MN_August_2025.csv"),
|
||||
("Sep_2025", "https://bit.ly/4sm4pxY", "GaN-MN_September_2025.csv"),
|
||||
("Jul_2025", "https://bit.ly/4jlEQtz", "GaN-MN_July_2025.csv"),
|
||||
("May_2025", "https://bit.ly/47w5sCA", "GaN-MN_May_2025.csv"),
|
||||
("Apr_2025", "https://bit.ly/4nxs6Bm", "GaN-MN_April_2025.csv"),
|
||||
("Mar_2025", "https://bit.ly/47mPHzx", "GaN-MN_March_2025.csv"),
|
||||
("Feb_2025", "https://bit.ly/4lbwPXg", "GaN-MN_February_2025.csv"),
|
||||
# 2024 (skip Aug 2024 — already have it)
|
||||
("Dec_2024", "https://bit.ly/4mHSBno", "GaN-MN_December_2024.csv"),
|
||||
("Nov_2024", "https://bit.ly/4lVwidL", "GaN-MN_November_2024.csv"),
|
||||
("Oct_2024", "https://bit.ly/449h3ay", "GaN-MN_October_2024.csv"),
|
||||
("Sep_2024", "https://bit.ly/4hUE458", "GaN-MN_September_2024.csv"),
|
||||
("Jul_2024", "https://bit.ly/4fAjvJi", "GaN-MN_July_2024.csv"),
|
||||
("Jun_2024", "https://bit.ly/3ZvT1DR", "GaN-MN_June_2024.csv"),
|
||||
("May_2024", "https://bit.ly/4f3veR8", "GaN-MN_May_2024.csv"),
|
||||
("Apr_2024", "https://bit.ly/4etfYg0", "GaN-MN_April_2024.csv"),
|
||||
("Mar_2024", "https://bit.ly/3Z69pLy", "GaN-MN_March_2024.csv"),
|
||||
("Feb_2024", "https://bit.ly/46wtyfJ", "GaN-MN_February_2024.csv"),
|
||||
("Jan_2024", "https://bit.ly/4cMztz7", "GaN-MN_January_2024.csv"),
|
||||
# 2023
|
||||
("Dec_2023", "https://bit.ly/45dEDl3", "GaN-MN_December_2023.csv"),
|
||||
("Nov_2023", "https://bit.ly/4ajnV4v", "GaN-MN_November_2023.csv"),
|
||||
("Oct_2023", "https://bit.ly/3PKzryD", "GaN-MN_October_2023.csv"),
|
||||
("Sep_2023", "https://bit.ly/3ww3T8K", "GaN-MN_September_2023.csv"),
|
||||
("Aug_2023", "https://bit.ly/3So1UKV", "GaN-MN_August_2023.csv"),
|
||||
("Jul_2023", "https://bit.ly/3S3mG3u", "GaN-MN_July_2023.csv"),
|
||||
("Jun_2023", "https://bit.ly/46Es4OL", "GaN-MN_June_2023.csv"),
|
||||
("May_2023", "https://bit.ly/46WbF9m", "GaN-MN_May_2023.csv"),
|
||||
("Apr_2023", "https://bit.ly/3rxWWSy", "GaN-MN_April_2023.csv"),
|
||||
("Mar_2023", "https://bit.ly/3OW45nv", "GaN-MN_March_2023.csv"),
|
||||
("Feb_2023", "https://bit.ly/3rSY2br", "GaN-MN_February_2023.csv"),
|
||||
("Jan_2023", "https://bit.ly/3JGJpy1", "GaN-MN_January_2023.csv"),
|
||||
# 2022
|
||||
("Dec_2022", "https://bit.ly/43CelXB", "GaN-MN_December_2022.csv"),
|
||||
("Nov_2022", "https://bit.ly/429RdyR", "GaN-MN_November_2022.csv"),
|
||||
("Oct_2022", "https://bit.ly/3m1onBm", "GaN-MN_October_2022.csv"),
|
||||
("Sep_2022", "http://bit.ly/3SHQJN5", "GaN-MN_September_2022.csv"),
|
||||
("Aug_2022", "http://bit.ly/40hFpdP", "GaN-MN_August_2022.csv"),
|
||||
("Jul_2022", "https://bit.ly/3Z487OD", "GaN-MN_July_2022.csv"),
|
||||
("Jun_2022", "https://bit.ly/3H5CsWS", "GaN-MN_June_2022.csv"),
|
||||
("May_2022", "https://bit.ly/3fnFCJT", "GaN-MN_May_2022.csv"),
|
||||
("Apr_2022", "https://bit.ly/3E8flJr", "GaN-MN_April_2022.csv"),
|
||||
("Mar_2022", "https://bit.ly/3B76VAq", "GaN-MN_March_2022.csv"),
|
||||
("Feb_2022", "http://bit.ly/3BCkXe1", "GaN-MN_February_2022.csv"),
|
||||
("Jan_2022", "http://bit.ly/3IgfkU4", "GaN-MN_January_2022.csv"),
|
||||
]
|
||||
|
||||
# Annual file for 2021 (one representative year)
|
||||
ANNUAL_FILES = [
|
||||
("2021", "https://bit.ly/44vtt9T", "GaN-MN_Annual_2021.csv"),
|
||||
]
|
||||
|
||||
FIELDNAMES = [
|
||||
"prayer", "date_local", "time_local", "utc_offset",
|
||||
"lat", "lng", "elevation_m", "source", "notes",
|
||||
]
|
||||
|
||||
|
||||
def resolve_and_download(label: str, url: str, dest: Path) -> bool:
|
||||
"""
|
||||
Follow bit.ly redirect and download the file to dest.
|
||||
Returns True on success.
|
||||
"""
|
||||
if dest.exists():
|
||||
log.info("Already have %s — skipping download", dest.name)
|
||||
return True
|
||||
|
||||
log.info("Downloading %s from %s ...", label, url)
|
||||
try:
|
||||
# bit.ly does a redirect; requests follows by default
|
||||
headers = {
|
||||
"User-Agent": (
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
"Chrome/124.0.0.0 Safari/537.36"
|
||||
)
|
||||
}
|
||||
with requests.get(url, headers=headers, stream=True, timeout=120) as resp:
|
||||
resp.raise_for_status()
|
||||
total = int(resp.headers.get("content-length", 0))
|
||||
downloaded = 0
|
||||
with open(dest, "wb") as fh:
|
||||
for chunk in resp.iter_content(chunk_size=1 << 20): # 1 MB chunks
|
||||
if chunk:
|
||||
fh.write(chunk)
|
||||
downloaded += len(chunk)
|
||||
mb = downloaded / (1 << 20)
|
||||
log.info("Downloaded %s: %.1f MB", dest.name, mb)
|
||||
return True
|
||||
except Exception as exc:
|
||||
log.error("Failed to download %s: %s", label, exc)
|
||||
if dest.exists():
|
||||
dest.unlink()
|
||||
return False
|
||||
|
||||
|
||||
def process_and_save(label: str, csv_path: Path) -> int:
|
||||
"""
|
||||
Run the GaN-MN processor on csv_path and write results to
|
||||
data/raw/raw_sightings/gan_mn_{label_lower}.csv.
|
||||
Returns count of records written.
|
||||
"""
|
||||
slug = label.lower().replace(" ", "_")
|
||||
out_path = SIGHTINGS_DIR / f"gan_mn_{slug}.csv"
|
||||
|
||||
if out_path.exists():
|
||||
# Count existing rows
|
||||
with open(out_path) as fh:
|
||||
existing = sum(1 for _ in fh) - 1 # subtract header
|
||||
log.info("Already processed %s (%d rows) — skipping", out_path.name, existing)
|
||||
return existing
|
||||
|
||||
log.info("Processing %s ...", csv_path.name)
|
||||
try:
|
||||
records = process_gan_mn_csv(csv_path)
|
||||
except Exception as exc:
|
||||
log.error("Processing failed for %s: %s", csv_path.name, exc)
|
||||
return 0
|
||||
|
||||
if not records:
|
||||
log.warning("No records extracted from %s", csv_path.name)
|
||||
return 0
|
||||
|
||||
with open(out_path, "w", newline="") as fh:
|
||||
writer = csv.DictWriter(fh, fieldnames=FIELDNAMES)
|
||||
writer.writeheader()
|
||||
writer.writerows(records)
|
||||
|
||||
log.info("Wrote %d records to %s", len(records), out_path.name)
|
||||
return len(records)
|
||||
|
||||
|
||||
def main() -> None:
|
||||
all_files = MONTHLY_FILES + ANNUAL_FILES
|
||||
total_downloaded = 0
|
||||
total_records = 0
|
||||
skipped_download = 0
|
||||
failed_download = []
|
||||
results = []
|
||||
|
||||
for label, url, filename in all_files:
|
||||
dest = RAW_DIR / filename
|
||||
ok = resolve_and_download(label, url, dest)
|
||||
if not ok:
|
||||
failed_download.append(label)
|
||||
continue
|
||||
if dest.stat().st_size < 1000:
|
||||
log.error("%s appears too small (%d bytes), skipping processing", filename, dest.stat().st_size)
|
||||
failed_download.append(label)
|
||||
continue
|
||||
|
||||
total_downloaded += 1
|
||||
count = process_and_save(label, dest)
|
||||
total_records += count
|
||||
results.append((label, count))
|
||||
|
||||
# Brief pause between files to avoid hammering the network
|
||||
time.sleep(0.5)
|
||||
|
||||
# Also count already-existing sightings from the 3 pre-downloaded months
|
||||
pre_existing = ["august_2024", "january_2025", "june_2025"]
|
||||
pre_total = 0
|
||||
for slug in pre_existing:
|
||||
p = SIGHTINGS_DIR / f"gan_mn_{slug}.csv"
|
||||
if p.exists():
|
||||
with open(p) as fh:
|
||||
n = sum(1 for _ in fh) - 1
|
||||
pre_total += n
|
||||
log.info("Pre-existing %s: %d rows", p.name, n)
|
||||
|
||||
log.info("=" * 60)
|
||||
log.info("Download summary:")
|
||||
log.info(" Files newly processed: %d", total_downloaded)
|
||||
log.info(" Records from new files: %d", total_records)
|
||||
log.info(" Records from pre-existing 3 months: %d", pre_total)
|
||||
log.info(" Grand total twilight events: %d", total_records + pre_total)
|
||||
if failed_download:
|
||||
log.warning(" Failed downloads: %s", ", ".join(failed_download))
|
||||
|
||||
log.info("Per-file breakdown:")
|
||||
for label, count in results:
|
||||
log.info(" %-15s %d", label, count)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
169
src/collect/madrid_sqm_processor.py
Normal file
169
src/collect/madrid_sqm_processor.py
Normal file
|
|
@ -0,0 +1,169 @@
|
|||
"""
|
||||
Processor for the Madrid Zenodo SQM dataset (Record 4633001).
|
||||
|
||||
This script reads data/raw/massive_sqm/SQM_evol.csv, representing 10 years of
|
||||
continuous minute-by-minute Sky Quality Meter readings in Madrid, Spain.
|
||||
|
||||
We apply the BRIN inflection-point detection algorithm:
|
||||
- Fajr: Moment of steepest SQM decline in the pre-dawn window.
|
||||
- Isha: Moment SQM reaches within 0.5 mag of the dark-night baseline.
|
||||
|
||||
Output: data/raw/raw_sightings/madrid_sqm_10yr.csv
|
||||
"""
|
||||
|
||||
import math
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone, timedelta
|
||||
import pandas as pd
|
||||
import ephem
|
||||
|
||||
# Madrid UCM Observatory from paper metadata
|
||||
LAT = 40.45
|
||||
LNG = -3.73
|
||||
ELEVATION_M = 640
|
||||
UTC_OFFSET = 1.0 # Winter UTC+1, though data is in UTC so we just use this for local labeling
|
||||
|
||||
RAW_CSV = Path("data/raw/massive_sqm/SQM_evol.csv")
|
||||
OUT_FILE = Path("data/raw/raw_sightings/madrid_sqm_10yr.csv")
|
||||
|
||||
MIN_DARK_SKY_MPSAS = 18.0
|
||||
MAX_MOON_ALT = 5.0
|
||||
|
||||
def get_solar_altitude(utc_dt: datetime) -> float:
|
||||
obs = ephem.Observer()
|
||||
obs.lat = str(LAT)
|
||||
obs.lon = str(LNG)
|
||||
obs.elevation = ELEVATION_M
|
||||
obs.pressure = 1013.25
|
||||
obs.temp = 15.0
|
||||
if utc_dt.tzinfo is not None:
|
||||
utc_dt = utc_dt.replace(tzinfo=None)
|
||||
obs.date = ephem.Date(utc_dt)
|
||||
sun = ephem.Sun(obs)
|
||||
return math.degrees(float(sun.alt))
|
||||
|
||||
def process_madrid_data():
|
||||
if not RAW_CSV.exists():
|
||||
print(f"File not found: {RAW_CSV}")
|
||||
return
|
||||
|
||||
print("Loading Madrid SQM dataset. This may take a minute...")
|
||||
# Format: Time(UTC);Magnitude
|
||||
df = pd.read_csv(RAW_CSV, sep=';', usecols=['Time(UTC)', 'Magnitude'])
|
||||
df['utc_dt'] = pd.to_datetime(df['Time(UTC)']).dt.tz_localize(timezone.utc)
|
||||
df = df.rename(columns={'Magnitude': 'mpsas'})
|
||||
|
||||
# Filter bad data
|
||||
df = df[df['mpsas'] > 0]
|
||||
df = df.sort_values('utc_dt').reset_index(drop=True)
|
||||
|
||||
print(f"Loaded {len(df)} rows. Processing nights...")
|
||||
|
||||
# Identify unique local calendar dates in the dataset
|
||||
df['local_date'] = (df['utc_dt'] + timedelta(hours=UTC_OFFSET)).dt.date
|
||||
unique_dates = df['local_date'].unique()
|
||||
|
||||
records = []
|
||||
|
||||
obs = ephem.Observer()
|
||||
obs.lat = str(LAT)
|
||||
obs.lon = str(LNG)
|
||||
obs.elevation = ELEVATION_M
|
||||
obs.pressure = 1013.25
|
||||
obs.temp = 15.0
|
||||
|
||||
print(f"Found {len(unique_dates)} nights to process.")
|
||||
for local_date in unique_dates:
|
||||
# Midnight UTC for the given local date
|
||||
noon_utc = datetime(local_date.year, local_date.month, local_date.day, 12, 0, tzinfo=timezone.utc) - timedelta(hours=UTC_OFFSET)
|
||||
|
||||
obs.date = ephem.Date(noon_utc.replace(tzinfo=None))
|
||||
try:
|
||||
sunset_utc = obs.next_setting(ephem.Sun()).datetime().replace(tzinfo=timezone.utc)
|
||||
sunrise_utc = obs.next_rising(ephem.Sun()).datetime().replace(tzinfo=timezone.utc)
|
||||
except ephem.AlwaysUpError:
|
||||
continue
|
||||
except ephem.NeverUpError:
|
||||
continue
|
||||
|
||||
# FAJR
|
||||
window_start = sunrise_utc - timedelta(hours=5)
|
||||
window_end = sunrise_utc - timedelta(minutes=10)
|
||||
predawn = df[(df['utc_dt'] >= window_start) & (df['utc_dt'] <= window_end)].copy()
|
||||
|
||||
if len(predawn) >= 30:
|
||||
predawn['sun_alt'] = [get_solar_altitude(dt) for dt in predawn['utc_dt']]
|
||||
predawn = predawn[predawn['sun_alt'] < -5.0]
|
||||
|
||||
early = predawn[predawn['sun_alt'] < -18.0]
|
||||
if early.empty:
|
||||
early = predawn[predawn['sun_alt'] < -16.0]
|
||||
if not early.empty and early['mpsas'].max() >= MIN_DARK_SKY_MPSAS:
|
||||
predawn['mpsas_smooth'] = predawn['mpsas'].rolling(window=5, center=True, min_periods=3).mean()
|
||||
predawn['dmpsas'] = predawn['mpsas_smooth'].diff()
|
||||
|
||||
active = predawn[(predawn['sun_alt'] >= -25.0) & (predawn['sun_alt'] <= -5.0)]
|
||||
if len(active) >= 10:
|
||||
steepest_idx = active['dmpsas'].idxmin()
|
||||
if not pd.isna(steepest_idx):
|
||||
dawn_row = predawn.loc[steepest_idx]
|
||||
depression_angle = -dawn_row['sun_alt']
|
||||
|
||||
if 10.0 <= depression_angle <= 22.0:
|
||||
local_time = dawn_row['utc_dt'] + timedelta(hours=UTC_OFFSET)
|
||||
records.append({
|
||||
"prayer": "fajr",
|
||||
"date_local": local_time.strftime("%Y-%m-%d"),
|
||||
"time_local": local_time.strftime("%H:%M"),
|
||||
"utc_offset": UTC_OFFSET,
|
||||
"lat": LAT,
|
||||
"lng": LNG,
|
||||
"elevation_m": ELEVATION_M,
|
||||
"source": "Madrid Zenodo SQM (10-year)",
|
||||
"notes": f"SQM inflection point Fajr detection; Madrid UCM Observatory; dark-sky baseline>={MIN_DARK_SKY_MPSAS}"
|
||||
})
|
||||
|
||||
# ISHA
|
||||
window_end_isha = sunset_utc + timedelta(hours=5)
|
||||
evening = df[(df['utc_dt'] >= sunset_utc) & (df['utc_dt'] <= window_end_isha)].copy()
|
||||
|
||||
if len(evening) >= 30:
|
||||
evening['sun_alt'] = [get_solar_altitude(dt) for dt in evening['utc_dt']]
|
||||
evening = evening[evening['sun_alt'] < 0]
|
||||
|
||||
deep_night = evening[(evening['utc_dt'] >= sunset_utc + timedelta(hours=2)) &
|
||||
(evening['utc_dt'] <= sunset_utc + timedelta(hours=6)) &
|
||||
(evening['sun_alt'] < -18.0)]
|
||||
|
||||
if len(deep_night) >= 10:
|
||||
dark_baseline = deep_night['mpsas'].median()
|
||||
if dark_baseline >= MIN_DARK_SKY_MPSAS:
|
||||
evening['mpsas_smooth'] = evening['mpsas'].rolling(window=5, center=True, min_periods=3).mean()
|
||||
threshold = dark_baseline - 0.5
|
||||
|
||||
reached = evening[(evening['mpsas_smooth'] >= threshold) & (evening['sun_alt'] < -12.0)]
|
||||
if not reached.empty:
|
||||
isha_row = reached.iloc[0]
|
||||
depression_angle = -isha_row['sun_alt']
|
||||
|
||||
if 12.0 <= depression_angle <= 22.0:
|
||||
local_time = isha_row['utc_dt'] + timedelta(hours=UTC_OFFSET)
|
||||
records.append({
|
||||
"prayer": "isha",
|
||||
"date_local": local_time.strftime("%Y-%m-%d"),
|
||||
"time_local": local_time.strftime("%H:%M"),
|
||||
"utc_offset": UTC_OFFSET,
|
||||
"lat": LAT,
|
||||
"lng": LNG,
|
||||
"elevation_m": ELEVATION_M,
|
||||
"source": "Madrid Zenodo SQM (10-year)",
|
||||
"notes": f"SQM threshold Isha detection (Shafaq Abyad); dark-sky baseline={dark_baseline:.2f}"
|
||||
})
|
||||
|
||||
out_df = pd.DataFrame(records)
|
||||
OUT_FILE.parent.mkdir(parents=True, exist_ok=True)
|
||||
out_df.to_csv(OUT_FILE, index=False)
|
||||
print(f"Successfully processed {len(records)} Fajr/Isha observations to {OUT_FILE}.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
process_madrid_data()
|
||||
282
src/surfrad_process.py
Normal file
282
src/surfrad_process.py
Normal file
|
|
@ -0,0 +1,282 @@
|
|||
"""
|
||||
SURFRAD Downloader and Twilight Extractor — standalone.
|
||||
|
||||
Stations:
|
||||
- dra: Desert Rock, Nevada (36.626°N, 116.018°W) — arid, excellent visibility
|
||||
- tbl: Table Mountain, Colorado (40.125°N, 105.237°W) — high altitude (1689m)
|
||||
- gwn: Goodwin Creek, Mississippi (34.255°N, 89.873°W) — humid subtropical
|
||||
|
||||
File naming: {code}{YY}{DDD}.dat
|
||||
YY = 2-digit year
|
||||
DDD = day of year (001-366)
|
||||
|
||||
Column layout (SURFRAD standard format, v1):
|
||||
0: year
|
||||
1: jday (day of year)
|
||||
2: month
|
||||
3: day
|
||||
4: hour (UTC)
|
||||
5: min
|
||||
6: dt (time difference in minutes from standard time)
|
||||
7: zen (solar zenith angle, degrees)
|
||||
8: dw_solar (downwelling shortwave, W/m²) ← GHI
|
||||
9: dw_solar_sd
|
||||
10: direct (direct normal irradiance, W/m²)
|
||||
11: direct_sd
|
||||
12: diffuse (diffuse horizontal, W/m²) ← DIF
|
||||
13: diffuse_sd
|
||||
14+: other measurements
|
||||
|
||||
Missing value: -9999.9
|
||||
"""
|
||||
|
||||
import re
|
||||
import math
|
||||
import requests
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from datetime import datetime, timezone, timedelta, date
|
||||
from pathlib import Path
|
||||
|
||||
BASE = Path("/Volumes/X9/Sites/acamarata/pray-calc-ml")
|
||||
RAW_BSRN = BASE / "data/raw/bsrn"
|
||||
RAW_SIGHTINGS = BASE / "data/raw/raw_sightings"
|
||||
|
||||
THRESHOLDS = [0.5, 1.0, 2.0, 5.0, 10.0]
|
||||
SURFRAD_BASE = "https://gml.noaa.gov/aftp/data/radiation/surfrad"
|
||||
|
||||
STATIONS = {
|
||||
"dra": {"name": "Desert Rock", "lat": 36.626, "lon": -116.018, "dir": "dra"},
|
||||
"tbl": {"name": "TableMountain", "lat": 40.125, "lon": -105.237, "dir": "tbl"},
|
||||
"gwn": {"name": "GoodwinCreek", "lat": 34.2547, "lon": -89.8729, "dir": "gwn"},
|
||||
}
|
||||
|
||||
|
||||
def download_file(url: str, dest: Path) -> bool:
|
||||
if dest.exists() and dest.stat().st_size > 500:
|
||||
return True
|
||||
try:
|
||||
r = requests.get(url, timeout=60)
|
||||
r.raise_for_status()
|
||||
dest.write_bytes(r.content)
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def parse_surfrad_dat(content: bytes) -> pd.DataFrame:
|
||||
"""
|
||||
Parse SURFRAD .dat file.
|
||||
Returns df with [times_utc, sza, ghi, dif].
|
||||
"""
|
||||
lines = content.decode("utf-8", errors="replace").splitlines()
|
||||
|
||||
# First 2 lines are headers; data starts line 2
|
||||
records = []
|
||||
for line in lines[2:]:
|
||||
parts = line.split()
|
||||
if len(parts) < 13:
|
||||
continue
|
||||
try:
|
||||
yr = int(parts[0])
|
||||
jday = int(parts[1])
|
||||
hr = int(parts[4])
|
||||
mn = int(parts[5])
|
||||
zen = float(parts[7])
|
||||
ghi = float(parts[8])
|
||||
dif = float(parts[12])
|
||||
|
||||
dt = datetime(yr, 1, 1, hr, mn, tzinfo=timezone.utc) + timedelta(days=jday - 1)
|
||||
records.append({
|
||||
"times_utc": dt,
|
||||
"sza": None if zen < -900 else zen,
|
||||
"ghi": None if ghi < -900 else max(0.0, ghi),
|
||||
"dif": None if dif < -900 else max(0.0, dif),
|
||||
})
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
|
||||
return pd.DataFrame(records)
|
||||
|
||||
|
||||
def extract_events(df: pd.DataFrame, lat: float, lon: float,
|
||||
station_code: str, station_name: str,
|
||||
source: str) -> list[dict]:
|
||||
"""Same logic as bsrn_process.py extract_events_from_df."""
|
||||
events = []
|
||||
df = df.copy()
|
||||
df["times_utc"] = pd.to_datetime(df["times_utc"], utc=True, errors="coerce")
|
||||
df = df.dropna(subset=["times_utc"]).sort_values("times_utc").reset_index(drop=True)
|
||||
|
||||
# Use diffuse as primary signal, fall back to GHI
|
||||
df["irr"] = pd.to_numeric(df.get("dif", pd.Series()), errors="coerce")
|
||||
if "ghi" in df.columns:
|
||||
df["irr"] = df["irr"].combine_first(pd.to_numeric(df["ghi"], errors="coerce"))
|
||||
|
||||
df["sza"] = pd.to_numeric(df["sza"], errors="coerce")
|
||||
df["date"] = df["times_utc"].dt.date
|
||||
|
||||
for day, grp in df.groupby("date"):
|
||||
grp = grp.sort_values("times_utc").reset_index(drop=True)
|
||||
irr = grp["irr"].values
|
||||
sza = grp["sza"].values
|
||||
ts = grp["times_utc"].values
|
||||
n = len(irr)
|
||||
if n < 10:
|
||||
continue
|
||||
|
||||
for threshold in THRESHOLDS:
|
||||
dawn_idx = None
|
||||
for i in range(2, n):
|
||||
if (not np.isnan(irr[i]) and irr[i] >= threshold
|
||||
and not np.isnan(sza[i]) and sza[i] > 90.0):
|
||||
prev_ok = all(
|
||||
(np.isnan(v) or v < threshold)
|
||||
for v in irr[max(0, i-3):i]
|
||||
)
|
||||
if prev_ok:
|
||||
dawn_idx = i
|
||||
break
|
||||
|
||||
dusk_idx = None
|
||||
for i in range(n - 3, -1, -1):
|
||||
if (not np.isnan(irr[i]) and irr[i] >= threshold
|
||||
and not np.isnan(sza[i]) and sza[i] > 90.0):
|
||||
after_ok = all(
|
||||
(np.isnan(v) or v < threshold)
|
||||
for v in irr[i+1:min(n, i+4)]
|
||||
)
|
||||
if after_ok:
|
||||
dusk_idx = i
|
||||
break
|
||||
|
||||
for idx, event_type in [(dawn_idx, "dawn"), (dusk_idx, "dusk")]:
|
||||
if idx is None:
|
||||
continue
|
||||
sza_val = float(sza[idx])
|
||||
if sza_val <= 90.0:
|
||||
continue
|
||||
dep = round(sza_val - 90.0, 4)
|
||||
if dep < 0.3 or dep > 20.0:
|
||||
continue
|
||||
utc_ts = pd.Timestamp(ts[idx])
|
||||
irr_val = float(irr[idx])
|
||||
ghi_at = float(grp["ghi"].iloc[idx]) if "ghi" in grp.columns else float("nan")
|
||||
dif_at = float(grp["dif"].iloc[idx]) if "dif" in grp.columns else float("nan")
|
||||
|
||||
events.append({
|
||||
"date": str(day),
|
||||
"event_type": event_type,
|
||||
"utc_time": utc_ts.strftime("%H:%M:%S"),
|
||||
"sza_deg": round(sza_val, 4),
|
||||
"solar_depression_deg": dep,
|
||||
"irr_wm2": round(irr_val, 2),
|
||||
"ghi_wm2": round(ghi_at, 2) if not math.isnan(ghi_at) else None,
|
||||
"dif_wm2": round(dif_at, 2) if not math.isnan(dif_at) else None,
|
||||
"threshold_wm2": threshold,
|
||||
"lat": lat,
|
||||
"lon": lon,
|
||||
"station_code": station_code,
|
||||
"station_name": station_name,
|
||||
"source": source,
|
||||
})
|
||||
|
||||
return events
|
||||
|
||||
|
||||
def process_station(code: str, meta: dict, years: list[int]) -> list[dict]:
|
||||
print(f"\n {meta['name']} ({code}) lat={meta['lat']} lon={meta['lon']}")
|
||||
dir_code = meta["dir"]
|
||||
dest_root = RAW_BSRN / "surfrad" / dir_code
|
||||
dest_root.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
all_events = []
|
||||
for year in years:
|
||||
yy = str(year)[2:] # 2-digit year
|
||||
# Build list of days
|
||||
import calendar
|
||||
days_in_year = 366 if calendar.isleap(year) else 365
|
||||
|
||||
dest_dir = dest_root / str(year)
|
||||
dest_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
day_dfs = []
|
||||
success_count = 0
|
||||
for doy in range(1, days_in_year + 1):
|
||||
fname = f"{dir_code}{yy}{doy:03d}.dat"
|
||||
dest = dest_dir / fname
|
||||
url = f"{SURFRAD_BASE}/{dir_code}/{year}/{fname}"
|
||||
|
||||
ok = download_file(url, dest)
|
||||
if not ok:
|
||||
continue
|
||||
with open(dest, "rb") as f:
|
||||
day_df = parse_surfrad_dat(f.read())
|
||||
if not day_df.empty:
|
||||
day_dfs.append(day_df)
|
||||
success_count += 1
|
||||
|
||||
if not day_dfs:
|
||||
print(f" {year}: 0 days downloaded")
|
||||
continue
|
||||
|
||||
df_year = pd.concat(day_dfs, ignore_index=True)
|
||||
evts = extract_events(df_year, meta["lat"], meta["lon"],
|
||||
code, meta["name"], f"surfrad_{year}")
|
||||
print(f" {year}: {success_count} days → {len(evts)} twilight events "
|
||||
f"({len(df_year):,} 1-min records)")
|
||||
all_events.extend(evts)
|
||||
|
||||
return all_events
|
||||
|
||||
|
||||
def main():
|
||||
print("SURFRAD Twilight Extractor")
|
||||
years = [2018, 2019, 2020]
|
||||
print(f"Years: {years}")
|
||||
print(f"Stations: {list(STATIONS.keys())}")
|
||||
|
||||
all_events = []
|
||||
for code, meta in STATIONS.items():
|
||||
evts = process_station(code, meta, years)
|
||||
all_events.extend(evts)
|
||||
print(f" → {len(evts)} total events for {code}")
|
||||
|
||||
# Save
|
||||
out = RAW_SIGHTINGS / "surfrad_twilight.csv"
|
||||
if all_events:
|
||||
df = pd.DataFrame(all_events)
|
||||
df = df.sort_values(["station_code", "date", "event_type", "threshold_wm2"])
|
||||
df.to_csv(out, index=False)
|
||||
print(f"\nSaved {len(df):,} rows → {out}")
|
||||
|
||||
prim = df[df["threshold_wm2"] == 1.0]
|
||||
print(f"\nSUMMARY (1.0 W/m² threshold):")
|
||||
print(f" Total: {len(prim):,} events")
|
||||
for evt in ["dawn", "dusk"]:
|
||||
sub = prim[prim["event_type"] == evt]
|
||||
if sub.empty:
|
||||
continue
|
||||
d = sub["solar_depression_deg"]
|
||||
print(f" {evt.upper()}: {len(sub):,} | "
|
||||
f"mean={d.mean():.2f}° p10={d.quantile(0.1):.2f}° "
|
||||
f"p90={d.quantile(0.9):.2f}°")
|
||||
else:
|
||||
print("No events extracted.")
|
||||
|
||||
# Merge into bsrn_all_twilight.csv
|
||||
all_file = RAW_SIGHTINGS / "bsrn_all_twilight.csv"
|
||||
if all_file.exists() and all_events:
|
||||
df_existing = pd.read_csv(all_file)
|
||||
# Remove old surfrad rows if any
|
||||
df_existing = df_existing[~df_existing["source"].str.startswith("surfrad", na=False)]
|
||||
df_new = pd.concat([df_existing, pd.DataFrame(all_events)], ignore_index=True)
|
||||
df_new = df_new.sort_values(["station_code", "date", "event_type", "threshold_wm2"])
|
||||
df_new.to_csv(all_file, index=False)
|
||||
print(f"\nUpdated bsrn_all_twilight.csv → {len(df_new):,} total rows")
|
||||
|
||||
print("\nDone.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
204
src/surfrad_process2.py
Normal file
204
src/surfrad_process2.py
Normal file
|
|
@ -0,0 +1,204 @@
|
|||
"""
|
||||
SURFRAD processor — reads already-downloaded files in DRA/ and TBL/ dirs.
|
||||
Also downloads remaining missing days.
|
||||
"""
|
||||
|
||||
import math
|
||||
import requests
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import calendar
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
BASE = Path("/Volumes/X9/Sites/acamarata/pray-calc-ml")
|
||||
RAW_BSRN = BASE / "data/raw/bsrn"
|
||||
RAW_SIGHTINGS = BASE / "data/raw/raw_sightings"
|
||||
THRESHOLDS = [0.5, 1.0, 2.0, 5.0, 10.0]
|
||||
SURFRAD_BASE = "https://gml.noaa.gov/aftp/data/radiation/surfrad"
|
||||
|
||||
STATIONS = {
|
||||
"dra": {"name": "Desert Rock", "lat": 36.626, "lon": -116.018, "dir_code": "DRA", "file_prefix": "dra"},
|
||||
"tbl": {"name": "TableMountain", "lat": 40.125, "lon": -105.237, "dir_code": "TBL", "file_prefix": "tbl"},
|
||||
}
|
||||
YEARS = [2018, 2019, 2020]
|
||||
|
||||
|
||||
def download_file(url: str, dest: Path) -> bool:
|
||||
if dest.exists() and dest.stat().st_size > 200:
|
||||
return True
|
||||
try:
|
||||
r = requests.get(url, timeout=30)
|
||||
if r.status_code != 200:
|
||||
return False
|
||||
dest.write_bytes(r.content)
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def parse_surfrad_dat(fpath: Path) -> pd.DataFrame:
|
||||
lines = fpath.read_bytes().decode("utf-8", errors="replace").splitlines()
|
||||
records = []
|
||||
for line in lines[2:]:
|
||||
parts = line.split()
|
||||
if len(parts) < 13:
|
||||
continue
|
||||
try:
|
||||
yr = int(parts[0])
|
||||
jday = int(parts[1])
|
||||
hr = int(parts[4])
|
||||
mn = int(parts[5])
|
||||
zen = float(parts[7])
|
||||
ghi = float(parts[8])
|
||||
dif = float(parts[12])
|
||||
dt = datetime(yr, 1, 1, hr, mn, tzinfo=timezone.utc) + timedelta(days=jday - 1)
|
||||
records.append({
|
||||
"times_utc": dt,
|
||||
"sza": None if zen < -900 else zen,
|
||||
"ghi": None if ghi < -900 else max(0.0, ghi),
|
||||
"dif": None if dif < -900 else max(0.0, dif),
|
||||
})
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
return pd.DataFrame(records)
|
||||
|
||||
|
||||
def extract_events(df, lat, lon, code, name, source):
|
||||
events = []
|
||||
df = df.copy()
|
||||
df["times_utc"] = pd.to_datetime(df["times_utc"], utc=True, errors="coerce")
|
||||
df = df.dropna(subset=["times_utc"]).sort_values("times_utc").reset_index(drop=True)
|
||||
df["irr"] = pd.to_numeric(df.get("dif", pd.Series(dtype=float)), errors="coerce")
|
||||
if "ghi" in df.columns:
|
||||
df["irr"] = df["irr"].combine_first(pd.to_numeric(df["ghi"], errors="coerce"))
|
||||
df["sza"] = pd.to_numeric(df["sza"], errors="coerce")
|
||||
df["date"] = df["times_utc"].dt.date
|
||||
|
||||
for day, grp in df.groupby("date"):
|
||||
grp = grp.sort_values("times_utc").reset_index(drop=True)
|
||||
irr = grp["irr"].values
|
||||
sza = grp["sza"].values
|
||||
ts = grp["times_utc"].values
|
||||
n = len(irr)
|
||||
if n < 10:
|
||||
continue
|
||||
for threshold in THRESHOLDS:
|
||||
dawn_idx = None
|
||||
for i in range(2, n):
|
||||
if (not np.isnan(irr[i]) and irr[i] >= threshold
|
||||
and not np.isnan(sza[i]) and sza[i] > 90.0
|
||||
and all((np.isnan(v) or v < threshold) for v in irr[max(0,i-3):i])):
|
||||
dawn_idx = i
|
||||
break
|
||||
dusk_idx = None
|
||||
for i in range(n-3, -1, -1):
|
||||
if (not np.isnan(irr[i]) and irr[i] >= threshold
|
||||
and not np.isnan(sza[i]) and sza[i] > 90.0
|
||||
and all((np.isnan(v) or v < threshold) for v in irr[i+1:min(n,i+4)])):
|
||||
dusk_idx = i
|
||||
break
|
||||
for idx, etype in [(dawn_idx, "dawn"), (dusk_idx, "dusk")]:
|
||||
if idx is None:
|
||||
continue
|
||||
sza_val = float(sza[idx])
|
||||
if sza_val <= 90.0:
|
||||
continue
|
||||
dep = round(sza_val - 90.0, 4)
|
||||
if dep < 0.3 or dep > 20.0:
|
||||
continue
|
||||
utc_ts = pd.Timestamp(ts[idx])
|
||||
ghi_at = float(grp["ghi"].iloc[idx]) if "ghi" in grp.columns else float("nan")
|
||||
dif_at = float(grp["dif"].iloc[idx]) if "dif" in grp.columns else float("nan")
|
||||
events.append({
|
||||
"date": str(day), "event_type": etype,
|
||||
"utc_time": utc_ts.strftime("%H:%M:%S"),
|
||||
"sza_deg": round(sza_val, 4),
|
||||
"solar_depression_deg": dep,
|
||||
"irr_wm2": round(float(irr[idx]), 2),
|
||||
"ghi_wm2": round(ghi_at, 2) if not math.isnan(ghi_at) else None,
|
||||
"dif_wm2": round(dif_at, 2) if not math.isnan(dif_at) else None,
|
||||
"threshold_wm2": threshold,
|
||||
"lat": lat, "lon": lon,
|
||||
"station_code": code, "station_name": name, "source": source,
|
||||
})
|
||||
return events
|
||||
|
||||
|
||||
def main():
|
||||
print("SURFRAD Processor v2 — using cached downloads + filling gaps")
|
||||
all_events = []
|
||||
|
||||
for code, meta in STATIONS.items():
|
||||
print(f"\n{meta['name']} ({code})")
|
||||
dir_code = meta["dir_code"]
|
||||
prefix = meta["file_prefix"]
|
||||
dest_root = RAW_BSRN / "surfrad" / dir_code
|
||||
|
||||
for year in YEARS:
|
||||
yy = str(year)[2:]
|
||||
days_in_year = 366 if calendar.isleap(year) else 365
|
||||
dest_dir = dest_root / str(year)
|
||||
dest_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Download any missing days
|
||||
missing = 0
|
||||
for doy in range(1, days_in_year + 1):
|
||||
fname = f"{prefix}{yy}{doy:03d}.dat"
|
||||
dest = dest_dir / fname
|
||||
if dest.exists() and dest.stat().st_size > 200:
|
||||
continue
|
||||
url = f"{SURFRAD_BASE}/{prefix}/{year}/{fname}"
|
||||
if download_file(url, dest):
|
||||
missing += 1
|
||||
|
||||
# Parse all available files
|
||||
day_dfs = []
|
||||
for fpath in sorted(dest_dir.glob(f"{prefix}{yy}*.dat")):
|
||||
day_df = parse_surfrad_dat(fpath)
|
||||
if not day_df.empty:
|
||||
day_dfs.append(day_df)
|
||||
|
||||
if not day_dfs:
|
||||
print(f" {year}: no data files found")
|
||||
continue
|
||||
|
||||
df_year = pd.concat(day_dfs, ignore_index=True)
|
||||
evts = extract_events(df_year, meta["lat"], meta["lon"],
|
||||
code, meta["name"], f"surfrad_{year}")
|
||||
print(f" {year}: {len(day_dfs)} days, {len(df_year):,} records → {len(evts)} events (downloaded {missing} new files)")
|
||||
all_events.extend(evts)
|
||||
|
||||
out = RAW_SIGHTINGS / "surfrad_twilight.csv"
|
||||
if all_events:
|
||||
df = pd.DataFrame(all_events).sort_values(
|
||||
["station_code", "date", "event_type", "threshold_wm2"])
|
||||
df.to_csv(out, index=False)
|
||||
print(f"\nSaved {len(df):,} rows → {out}")
|
||||
|
||||
prim = df[df["threshold_wm2"] == 1.0]
|
||||
print(f"\nSummary at 1 W/m²: {len(prim):,} events")
|
||||
for etype in ["dawn", "dusk"]:
|
||||
sub = prim[prim["event_type"] == etype]
|
||||
if not sub.empty:
|
||||
d = sub["solar_depression_deg"]
|
||||
print(f" {etype}: {len(sub):,} | mean={d.mean():.2f}° "
|
||||
f"p10={d.quantile(0.1):.2f}° p90={d.quantile(0.9):.2f}°")
|
||||
|
||||
# Update combined file
|
||||
combined = RAW_SIGHTINGS / "bsrn_all_twilight.csv"
|
||||
if combined.exists():
|
||||
df_all = pd.read_csv(combined)
|
||||
df_all = df_all[~df_all["source"].str.startswith("surfrad", na=False)]
|
||||
df_all = pd.concat([df_all, pd.DataFrame(all_events)], ignore_index=True)
|
||||
df_all = df_all.sort_values(["station_code", "date", "event_type", "threshold_wm2"])
|
||||
df_all.to_csv(combined, index=False)
|
||||
print(f"Updated bsrn_all_twilight.csv → {len(df_all):,} total rows")
|
||||
else:
|
||||
print("No events.")
|
||||
|
||||
print("\nDone.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Reference in a new issue