mirror of
https://github.com/acamarata/pray-calc-ml.git
synced 2026-06-30 19:04:26 +00:00
data: update pipeline + dataset to latest collected records
Regenerate fajr_angles.csv with current collection state; update wiki docs to reflect current dataset stats; add missing requirements and minor pipeline fixes.
This commit is contained in:
parent
d8471f8ca5
commit
6abc976bb9
15 changed files with 1747 additions and 1736 deletions
49
.github/wiki/Data-Collection.md
vendored
49
.github/wiki/Data-Collection.md
vendored
|
|
@ -44,15 +44,17 @@ This does three things in sequence:
|
||||||
|
|
||||||
1. **Fetches the OpenFajr iCal feed** from `calendar.google.com` — ~4,018 community-verified
|
1. **Fetches the OpenFajr iCal feed** from `calendar.google.com` — ~4,018 community-verified
|
||||||
Fajr records from Birmingham, UK, 2016-2026. Requires network access.
|
Fajr records from Birmingham, UK, 2016-2026. Requires network access.
|
||||||
2. **Loads manually compiled records** from `src/collect/verified_sightings.py` — ~141 records
|
2. **Loads manually compiled records** from `src/collect/verified_sightings.py` and per-source
|
||||||
from peer-reviewed studies across 35 locations worldwide.
|
CSVs in `data/raw/raw_sightings/`.
|
||||||
3. **Looks up missing elevations** via the [Open-Elevation API](https://open-elevation.com) for
|
3. **Loads pre-computed SQM angles** from `src/collect/precomputed_angles.py` (1,621 Basthoni
|
||||||
any record where `elevation_m == 0`.
|
2022 records where depression angles were measured directly by instrument).
|
||||||
|
4. **Looks up missing elevations** via the Open-Topo-Data API (with Open-Elevation fallback)
|
||||||
|
for any record where `elevation_m == 0`.
|
||||||
|
|
||||||
Output:
|
Output:
|
||||||
```
|
```
|
||||||
data/processed/fajr_angles.csv — ~4,105 Fajr records
|
data/processed/fajr_angles.csv — 5,871 Fajr records
|
||||||
data/processed/isha_angles.csv — ~43 Isha records
|
data/processed/isha_angles.csv — 46 Isha records
|
||||||
```
|
```
|
||||||
|
|
||||||
### Without elevation lookup
|
### Without elevation lookup
|
||||||
|
|
@ -72,14 +74,17 @@ Skips the Open-Elevation API calls. Use this when:
|
||||||
Loading OpenFajr Birmingham iCal feed...
|
Loading OpenFajr Birmingham iCal feed...
|
||||||
4018 Fajr records from OpenFajr
|
4018 Fajr records from OpenFajr
|
||||||
Loading manually verified sightings...
|
Loading manually verified sightings...
|
||||||
141 manually compiled records
|
... genuine manually compiled records (after quality filter)
|
||||||
|
Loading ingested raw CSV sightings...
|
||||||
|
... records from raw CSVs
|
||||||
|
Loading pre-computed angle records (SQM instrument data)...
|
||||||
|
1621 pre-computed angle records
|
||||||
Computing solar depression angles...
|
Computing solar depression angles...
|
||||||
Dropping 11 record(s) with implausible angles (< 7.0° Fajr / < 10.0° Isha):
|
Dropping N record(s) with implausible angles (< 7.0° Fajr / < 10.0° Isha):
|
||||||
FAJR 2021-03-27 ... angle=-18.71° — OpenFajr (openfajr.org)
|
|
||||||
...
|
...
|
||||||
|
|
||||||
Fajr dataset: 4105 records → data/processed/fajr_angles.csv
|
Fajr dataset: 5871 records → data/processed/fajr_angles.csv
|
||||||
Isha dataset: 43 records → data/processed/isha_angles.csv
|
Isha dataset: 46 records → data/processed/isha_angles.csv
|
||||||
```
|
```
|
||||||
|
|
||||||
Records dropped with "implausible angles" are data entry or DST-transition artifacts. The
|
Records dropped with "implausible angles" are data entry or DST-transition artifacts. The
|
||||||
|
|
@ -102,15 +107,23 @@ true dawn. The voted times are published as a public Google Calendar iCal feed.
|
||||||
- Fetched live by the pipeline — no local cache needed
|
- Fetched live by the pipeline — no local cache needed
|
||||||
|
|
||||||
This is the highest-quality source: actual community-reviewed per-date timestamps at a single
|
This is the highest-quality source: actual community-reviewed per-date timestamps at a single
|
||||||
well-documented location. It provides 98% of the Fajr training data.
|
well-documented location. It provides ~68% of the Fajr training data.
|
||||||
|
|
||||||
### Secondary: Manually compiled records
|
### Secondary: Basthoni 2022 SQM network (Indonesia)
|
||||||
|
|
||||||
Located in `src/collect/verified_sightings.py`. These come from:
|
1,621 per-night SQM records across 46 Indonesian sites, extracted from Basthoni's 2022 PhD
|
||||||
|
dissertation at UIN Walisongo. Each record is a direct instrument measurement where the Fajr
|
||||||
|
depression angle was determined by linear fitting of SQM time-series data. Loaded by
|
||||||
|
`src/collect/precomputed_angles.py`.
|
||||||
|
|
||||||
- Peer-reviewed academic papers (NRIAG Egypt, Malaysia, Indonesia, Saudi Arabia)
|
### Tertiary: Manually compiled records
|
||||||
- Community observation programs (Hizbul Ulama UK, Asim Yusuf UK, Moonsighting.com)
|
|
||||||
- National religious body publications (AFIC Australia, Jordanian Awqaf, etc.)
|
Located in `src/collect/verified_sightings.py` and per-source CSVs in `data/raw/raw_sightings/`.
|
||||||
|
These come from:
|
||||||
|
|
||||||
|
- Peer-reviewed academic papers (NRIAG Egypt, Malaysia, Indonesia, Saudi Arabia, Mauritania)
|
||||||
|
- Community observation programs (Miftahi/Shaukat UK, Asim Yusuf UK, Moonsighting.com)
|
||||||
|
- Institutional SQM data (BRIN Mount Timau, BRIN multistation network)
|
||||||
|
|
||||||
See [Data Sources](Data-Sources) for the full citation table.
|
See [Data Sources](Data-Sources) for the full citation table.
|
||||||
|
|
||||||
|
|
@ -179,7 +192,7 @@ python -m src.pipeline --no-elevation-lookup 2>&1 | grep -A5 "Dropping"
|
||||||
|
|
||||||
## Priority gaps to fill
|
## Priority gaps to fill
|
||||||
|
|
||||||
The Isha dataset is the most critical gap at ~43 records. Fajr has excellent Birmingham coverage
|
The Isha dataset is the most critical gap at 46 records. Fajr has excellent Birmingham coverage
|
||||||
but needs more geographic diversity:
|
but needs more geographic diversity:
|
||||||
|
|
||||||
| Gap | What to look for |
|
| Gap | What to look for |
|
||||||
|
|
|
||||||
2
.github/wiki/Data.md
vendored
2
.github/wiki/Data.md
vendored
|
|
@ -126,4 +126,4 @@ CSV format for raw_sightings: `prayer, date_local, time_local, utc_offset, lat,
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
*[← Data Sources](Data-Sources) . [Research -->](Research) . [Home](Home)*
|
*[← Data Sources](Data-Sources) · [Research →](Research) · [Home](Home)*
|
||||||
|
|
|
||||||
4
.github/wiki/Home.md
vendored
4
.github/wiki/Home.md
vendored
|
|
@ -57,3 +57,7 @@ virtually all well-documented sites.
|
||||||
---
|
---
|
||||||
|
|
||||||
*Part of the [acamarata](https://github.com/acamarata) Islamic computing library suite.*
|
*Part of the [acamarata](https://github.com/acamarata) Islamic computing library suite.*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
*[Data Collection →](Data-Collection)*
|
||||||
|
|
|
||||||
2
.github/wiki/Research.md
vendored
2
.github/wiki/Research.md
vendored
|
|
@ -267,4 +267,4 @@ Best strategies for expanding high-quality records:
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
*[<-- Data](Data) . [Research Notes -->](Research-Notes) . [Home](Home)*
|
*[← Data](Data) · [Research Notes →](Research-Notes) · [Home](Home)*
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load diff
|
|
@ -5,3 +5,8 @@ requests>=2.31
|
||||||
matplotlib>=3.7
|
matplotlib>=3.7
|
||||||
scikit-learn>=1.3
|
scikit-learn>=1.3
|
||||||
jupyter>=1.0
|
jupyter>=1.0
|
||||||
|
beautifulsoup4>=4.12
|
||||||
|
lxml>=5.0
|
||||||
|
pdfminer.six>=20231228
|
||||||
|
PyMuPDF>=1.24
|
||||||
|
duckduckgo-search>=6.0
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ because human observers see the sky with refraction — the angle we compute
|
||||||
matches what the sun physically was doing at that horizon.
|
matches what the sun physically was doing at that horizon.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime
|
||||||
import ephem
|
import ephem
|
||||||
import math
|
import math
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -27,15 +27,10 @@ Reference: Damanhuri & Mukarram (2022), LAPAN SQM multi-station Indonesia.
|
||||||
Mean D0 reported: -16.51° (all stations, quality-filtered).
|
Mean D0 reported: -16.51° (all stations, quality-filtered).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import sys
|
|
||||||
import math
|
|
||||||
import os
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from datetime import datetime, timezone, timedelta
|
from datetime import datetime, timezone, timedelta
|
||||||
from collections import defaultdict
|
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
# Station metadata: code -> (lat, lon, elevation_m, name, utc_offset)
|
# Station metadata: code -> (lat, lon, elevation_m, name, utc_offset)
|
||||||
|
|
|
||||||
|
|
@ -27,7 +27,6 @@ from io import StringIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
@ -319,7 +318,6 @@ def download_and_extract_all(output_dir: Path) -> list[dict]:
|
||||||
Caches downloaded files to output_dir/brin_multistation_raw/.
|
Caches downloaded files to output_dir/brin_multistation_raw/.
|
||||||
"""
|
"""
|
||||||
from urllib.request import urlopen, Request
|
from urllib.request import urlopen, Request
|
||||||
from urllib.error import URLError
|
|
||||||
|
|
||||||
# File ID → filename mapping from BRIN Dataverse API
|
# File ID → filename mapping from BRIN Dataverse API
|
||||||
FILE_IDS: dict[str, int] = {
|
FILE_IDS: dict[str, int] = {
|
||||||
|
|
|
||||||
|
|
@ -27,7 +27,6 @@ from datetime import datetime, timedelta
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,6 @@ Source: https://openfajr.org/
|
||||||
Location: Birmingham, UK — 52.4862°N, 1.8904°W, elevation 141 m
|
Location: Birmingham, UK — 52.4862°N, 1.8904°W, elevation 141 m
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import io
|
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
|
||||||
|
|
@ -22,7 +22,6 @@ may already be known for the site.
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import io
|
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
|
|
|
||||||
|
|
@ -15,7 +15,7 @@ Each record has: prayer, date, lat, lng, elevation_m, angle, source, notes.
|
||||||
The pipeline merges these after its own angle computation step.
|
The pipeline merges these after its own angle computation step.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime, timedelta, timezone
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1961,7 +1961,7 @@ def load_precomputed_angles() -> pd.DataFrame:
|
||||||
# This is only used for day_of_year; the angle is pre-computed.
|
# This is only used for day_of_year; the angle is pre-computed.
|
||||||
local_dt = datetime.strptime(f"{date_iso} 04:00", "%Y-%m-%d %H:%M")
|
local_dt = datetime.strptime(f"{date_iso} 04:00", "%Y-%m-%d %H:%M")
|
||||||
utc_dt = (local_dt - timedelta(hours=utc_offset)).replace(
|
utc_dt = (local_dt - timedelta(hours=utc_offset)).replace(
|
||||||
tzinfo=None
|
tzinfo=timezone.utc
|
||||||
)
|
)
|
||||||
|
|
||||||
weather_en = {
|
weather_en = {
|
||||||
|
|
|
||||||
|
|
@ -19,7 +19,6 @@ from __future__ import annotations
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
import time
|
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Literal
|
from typing import Literal
|
||||||
|
|
|
||||||
|
|
@ -29,9 +29,7 @@ Usage:
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import sys
|
import sys
|
||||||
import os
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from datetime import timezone
|
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
|
@ -52,7 +50,7 @@ PROCESSED_DIR = ROOT / "data" / "processed"
|
||||||
|
|
||||||
def _raw_to_df(records: list[dict]) -> pd.DataFrame:
|
def _raw_to_df(records: list[dict]) -> pd.DataFrame:
|
||||||
"""Convert a list of standardized raw record dicts to a DataFrame."""
|
"""Convert a list of standardized raw record dicts to a DataFrame."""
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta, timezone
|
||||||
rows = []
|
rows = []
|
||||||
for r in records:
|
for r in records:
|
||||||
try:
|
try:
|
||||||
|
|
@ -60,7 +58,9 @@ def _raw_to_df(records: list[dict]) -> pd.DataFrame:
|
||||||
f"{r['date_local']} {r['time_local']}", "%Y-%m-%d %H:%M"
|
f"{r['date_local']} {r['time_local']}", "%Y-%m-%d %H:%M"
|
||||||
)
|
)
|
||||||
utc_offset = float(r.get("utc_offset", 0))
|
utc_offset = float(r.get("utc_offset", 0))
|
||||||
utc_dt = dt_local - timedelta(hours=utc_offset)
|
utc_dt = (dt_local - timedelta(hours=utc_offset)).replace(
|
||||||
|
tzinfo=timezone.utc
|
||||||
|
)
|
||||||
rows.append({
|
rows.append({
|
||||||
"prayer": r["prayer"],
|
"prayer": r["prayer"],
|
||||||
"date": r["date_local"],
|
"date": r["date_local"],
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue