pray-calc-ml/scripts/washetdonker_download.py
Aric Camarata ada08e7ec4 data: expand dataset from 5.9k to 91k records via 6 new SQM sources
Add 6 new data collection pipelines and their processed outputs:

Sources added:
- TESS/Stars4All photometer network: 37 months (Jun 2017-Aug 2020),
  ~40k raw events from 100+ European stations via Zenodo archives
- Globe at Night citizen science: 26k twilight observations (2006-2024),
  filtered from 308k total observations for solar depression 6-22 deg
- GaN-MN continuous monitoring: 45 months (Jan 2022-Sep 2025),
  ~12.5k twilight events from 88 stations across 20+ countries
- Galicia SQM network: 14 stations, 1-min resolution, 7.5k events
- Madrid/Majadahonda SQM: multi-year continuous monitoring, 3.1k events
- washetdonker.nl Netherlands: 7 stations, 3.3k morning events
- Academic papers: Jordan (Abed 2015), Fayum Egypt, India photometer

Pipeline changes:
- ingest.py: add all new files to APPROVED_RAW_CSVS allowlist,
  fix filter to use allowlist instead of hardcoded exclusions
- .gitignore: exclude bulk raw data directories (BSRN, TESS, GaN-MN,
  washetdonker, Globe at Night downloads)

Final dataset: 56,668 Fajr + 34,763 Isha = 91,431 total records
Previous: 5,871 Fajr + 46 Isha = 5,917 total records
2026-03-22 16:39:29 -04:00

195 lines
6.2 KiB
Python

"""
Download SQM .dat files from the washetdonker.nl network.
The site exposes ~90 stations across the Netherlands, Germany, and nearby islands.
Each station has daily files at:
https://www.washetdonker.nl/data/{StationName}/{YYYY}/{MM}/{YYYYMMDD}_120000_SQM-{StationName}.dat
Station metadata (lat, lng) is embedded in each file header.
Usage:
python scripts/washetdonker_download.py [--year 2023] [--station Texel] [--since 2022] [--all]
python scripts/washetdonker_download.py --sample # 2024 only, for all stations
python scripts/washetdonker_download.py --all # all years (2020 onwards) for all stations
Files are saved to:
data/raw/washetdonker/{StationName}/{YYYY}/{MM}/{filename}.dat
"""
from __future__ import annotations
import argparse
import logging
import re
import time
from datetime import date, timedelta
from pathlib import Path
import requests
# Script-wide logging: INFO level, each record prefixed with a wall-clock time.
logging.basicConfig(
    datefmt="%H:%M:%S",
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
log = logging.getLogger(__name__)

# Root of the remote data tree; station/year/month directories hang off it.
BASE_URL = "https://www.washetdonker.nl/data"

# Two levels up from this file, i.e. the directory that contains "scripts/".
REPO_ROOT = Path(__file__).parent.parent

# Local mirror of the remote tree.
OUT_ROOT = REPO_ROOT.joinpath("data", "raw", "washetdonker")
# Station directory names, enumerated from the washetdonker.nl index.
# Each name is used verbatim in both the remote URL and the local path.
# Ordering: roughly north to south / by known data quality.
ALL_STATIONS = [
    "Texel", "Texel-Waalenburg", "Texel-puur",
    "Vlieland-Oost", "Vliehors",
    "Terschelling", "Natuurschuur-Terschelling",
    "Ameland-Natuurcentrum-Nes",
    "Schiermonnikoog", "Schiermonnikoog-centrum", "Schiermonnikoog-dorp",
    "Griend",
    "Spitsbergen",
    "Marker-Wadden",
    "Mandoe",
    "WEC-Lauwersoog", "Lauwersoog", "Lauwersoog-haven", "Lauwersmeer-extra",
    "Noordpolderzijl",
    "Roodeschool", "Oudeschip", "Termunten", "Farmsum", "Eemshaven",
    "Groningen-ZernikeCampus", "Groningen-Selwerd", "Groningen-DeHeld",
    "ZwarteHaan", "Zuidbroek", "Sappemeer",
    "Holwerd-1", "Holwerd-2", "Moddergat",
    "Bartlehiem", "Burgum", "Aldeboarn", "Akkrum",
    "Heerenveen01", "Heerenveen-Station",
    "Gorredijk", "Katlijk",
    "tZandt",
    "Hornhuizen", "Boerakker", "Tolbert",
    "Reitdiep",
    "Sellingen", "Erica",
    "Lemmer",
    "DenHelder", "Hippolytushoef",
    "Harlingen-Noord",
    "Radio-Kootwijk", "Kootwijk-dorp",
    "Hulshorst",
    "Haaksbergen", "Lochem",
    "Delft", "Rijswijk",
    "Artis-Amsterdam", "Westpoort",
    "Leiden-Sterrewacht",
    "DeZilk", "Oostkapelle",
    "Weerribben",
    "Westhoek",
    "Solarcity",
    "ObsonWheels",
    # German stations
    "Borkum", "Borkum-hafen", "Borkum-Ostland",
    "Emden", "Norddeich-eins", "Norddeich-zwei", "Norddeich-Flugplatz",
    "Oldenburg",
    "Spiekeroog", "Burlage",
    # Danish
    "Ribe",
    # NOTE: the index also lists a "ranking" page. It is not a station, so it
    # is deliberately left out of this list.
]
def build_url(station: str, year: int, month: int, day: int) -> str:
    """Return the remote URL of one station's daily SQM file.

    The file name is the calendar date, a fixed ``_120000`` stamp and the
    station name.  The date is built as a ``date`` object first, so an
    impossible day (e.g. 30 February) raises ``ValueError`` rather than
    producing a URL.
    """
    day_stamp = f"{date(year, month, day):%Y%m%d}"
    segments = (
        BASE_URL,
        station,
        f"{year}",
        f"{month:02d}",
        f"{day_stamp}_120000_SQM-{station}.dat",
    )
    return "/".join(segments)
def out_path(station: str, year: int, month: int, day: int) -> Path:
    """Return the local path where one station's daily SQM file is stored.

    Layout under ``OUT_ROOT``: ``{station}/{YYYY}/{MM}/{file name}``, with the
    same file name the remote site uses.  An impossible calendar date raises
    ``ValueError``.
    """
    day_stamp = format(date(year, month, day), "%Y%m%d")
    month_dir = OUT_ROOT / station / str(year) / f"{month:02d}"
    return month_dir / f"{day_stamp}_120000_SQM-{station}.dat"
def download_file(url: str, dest: Path, session: requests.Session) -> bool:
    """Fetch ``url`` into ``dest`` unless a usable copy is already on disk.

    Args:
        url: Remote location of the daily ``.dat`` file.
        dest: Local path to write; missing parent directories are created.
        session: Session used to issue the GET request.

    Returns:
        True only when a new file was written.  False when the file was
        already present, the server answered 404, the request failed, or the
        payload did not start with the expected header marker.
    """
    # Anything larger than 500 bytes counts as a finished earlier download.
    if dest.exists() and dest.stat().st_size > 500:
        return False

    try:
        resp = session.get(url, timeout=30)
        if resp.status_code == 404:
            # A missing file is an expected outcome when probing every
            # calendar day, so it is not reported.
            return False
        resp.raise_for_status()
    except requests.RequestException as exc:
        # Timeouts and non-404 HTTP errors mean data may exist but was not
        # fetched.  Report them at WARNING so they are visible at the script's
        # INFO log level instead of being silently dropped.
        log.warning(" FAIL %s: %s", url, exc)
        return False

    # Sanity check: a genuine data file starts with this header marker.
    content = resp.content
    if not content.startswith(b"# Definition"):
        log.debug(" SKIP (bad content): %s", url)
        return False

    dest.parent.mkdir(parents=True, exist_ok=True)
    # Write to a sibling temp file and rename it into place.  An interrupted
    # run can then never leave a truncated ``dest`` that the size check above
    # would later mistake for a complete download.
    partial = dest.with_name(dest.name + ".part")
    partial.write_bytes(content)
    partial.replace(dest)
    return True
def iter_dates(year: int):
    """Generate each calendar date of ``year`` in ascending order."""
    first = date(year, 1, 1)
    # Distance to the next 1 January gives 365 or 366 days as appropriate.
    span = date(year + 1, 1, 1) - first
    for offset in range(span.days):
        yield first + timedelta(days=offset)
def download_station_year(station: str, year: int, session: requests.Session) -> int:
    """Download the daily files of one station for one year.

    Args:
        station: Station directory name as used on the remote site.
        year: Calendar year to crawl.
        session: Session used for every request.

    Returns:
        The number of files newly written to disk.
    """
    # Do not probe days that have not happened yet: the site cannot have
    # published data for them.  One day of slack covers a local clock that is
    # in a timezone behind the stations'.
    last_day = date.today() + timedelta(days=1)

    new = 0
    for d in iter_dates(year):
        if d > last_day:
            break  # iter_dates is ascending, so every later date is future too
        url = build_url(station, year, d.month, d.day)
        dest = out_path(station, year, d.month, d.day)
        # Same "already downloaded" test download_file applies.  Checking it
        # here skips the throttle sleep when no request would be sent, so
        # re-running over an existing mirror is fast.
        if dest.exists() and dest.stat().st_size > 500:
            continue
        if download_file(url, dest, session):
            new += 1
            log.debug(" + %s", dest.name)
        time.sleep(0.05)  # polite crawl — 20 req/s max
    return new
def main():
    """Parse the command line and crawl the selected stations and years.

    Stations: the single ``--station`` if given, otherwise ``ALL_STATIONS``.

    Years, highest priority first: ``--year`` (that year only), ``--sample``
    (2024 only), ``--all`` (2020 up to the current year), otherwise
    ``--since`` (default 2022) up to the current year.
    """
    ap = argparse.ArgumentParser(description="Download washetdonker.nl SQM data")
    ap.add_argument("--station", help="Single station name (e.g. Texel)")
    ap.add_argument("--year", type=int, help="Single year (e.g. 2023)")
    ap.add_argument("--all", dest="all_years", action="store_true",
                    help="Download all years (2020 onwards) for all stations")
    ap.add_argument("--sample", action="store_true",
                    help="Download 2024 for all stations (quick sample)")
    ap.add_argument("--since", type=int, default=2022,
                    help="Download from this year onwards (default: 2022)")
    args = ap.parse_args()

    stations = [args.station] if args.station else ALL_STATIONS
    if args.station and args.station not in ALL_STATIONS:
        # Not an error (the list may lag behind the site), but a typo here
        # would otherwise cost thousands of silent 404 requests.
        log.warning("Station %r is not in the known station list", args.station)

    # Exclusive upper bound, derived from the clock so the script keeps
    # picking up new data in later years without an edit.
    end_year = date.today().year + 1
    if args.year:
        years = [args.year]
    elif args.sample:
        years = [2024]
    elif args.all_years:
        years = list(range(2020, end_year))
    else:
        years = list(range(args.since, end_year))
    log.info("Stations: %d Years: %s", len(stations), years)

    total_new = 0
    # Context manager closes the session's pooled connections on exit, even
    # if the crawl is interrupted.
    with requests.Session() as session:
        session.headers["User-Agent"] = (
            "pray-calc-ml/1.0 (academic research; twilight data; "
            "contact: alisalaah@gmail.com)"
        )
        for station in stations:
            for year in years:
                log.info("Downloading %s / %d ...", station, year)
                new = download_station_year(station, year, session)
                total_new += new
                if new:
                    log.info(" -> %d new files", new)

    log.info("Done. Total new files: %d", total_new)
    log.info("Output: %s", OUT_ROOT)


if __name__ == "__main__":
    main()