# pray-calc-ml/tests/test_sightings_clean.py
# (file-viewer header captured with the paste: 223 lines, 8.4 KiB, Python)

"""
Tests for src/collect/data/sightings_clean.py
Verifies filter_non_genuine, deduplicate_sightings, and apply_quality_filters
against known bad/good records and edge cases.
"""
import sys
sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent))
import pandas as pd
import pytest
from datetime import datetime, timezone, timedelta
from src.collect.data.sightings_clean import (
filter_non_genuine,
deduplicate_sightings,
apply_quality_filters,
BAD_NOTE_MARKERS,
FAJR_MIN_DEG,
ISHA_MIN_DEG,
MAX_DEG,
)
def _make_row(prayer="fajr", date="2023-03-20", lat=40.0, lng=29.0,
              elevation_m=100.0, source="Test source", notes="naked eye"):
    """Build a minimal sighting row dict for use in a test DataFrame.

    Each argument is stored under the key of the same name. ``utc_dt`` is
    derived from ``date`` as 04:00 UTC on that day, so the two fields always
    describe the same day; with the default ``date`` this is
    2023-03-20 04:00 UTC.

    Args:
        prayer: Value stored under ``"prayer"``.
        date: Date string in ``YYYY-MM-DD`` form, stored under ``"date"``.
        lat: Value stored under ``"lat"``.
        lng: Value stored under ``"lng"``.
        elevation_m: Value stored under ``"elevation_m"``.
        source: Value stored under ``"source"``.
        notes: Value stored under ``"notes"``.

    Returns:
        A dict with the keys ``prayer``, ``date``, ``utc_dt``, ``lat``,
        ``lng``, ``elevation_m``, ``source`` and ``notes``.
    """
    try:
        day = datetime.strptime(date, "%Y-%m-%d")
    except (TypeError, ValueError):
        # A non-ISO ``date`` still yields a usable row: fall back to the
        # fixed default day rather than failing inside the fixture builder.
        day = datetime(2023, 3, 20)
    utc_dt = day.replace(hour=4, tzinfo=timezone.utc)
    return {
        "prayer": prayer,
        "date": date,
        "utc_dt": utc_dt,
        "lat": lat,
        "lng": lng,
        "elevation_m": elevation_m,
        "source": source,
        "notes": notes,
    }
class TestFilterNonGenuine:
    """Exercises ``filter_non_genuine`` with rows built by ``_make_row``."""

    def test_genuine_record_is_kept(self):
        """A row whose notes contain no bad marker survives the filter."""
        frame = pd.DataFrame([_make_row(notes="naked eye; per-night obs")])
        kept = filter_non_genuine(frame)
        assert len(kept) == 1

    def test_bad_notes_marker_drops_row(self):
        """Notes containing 'time inferred' remove the row; the other stays."""
        rows = [
            _make_row(notes="time inferred from D0=14.5°"),
            _make_row(notes="naked eye observation"),
        ]
        kept = filter_non_genuine(pd.DataFrame(rows))
        assert len(kept) == 1
        surviving_notes = kept.iloc[0]["notes"]
        assert "naked eye observation" in surviving_notes

    def test_bad_source_marker_drops_row(self):
        """A bad marker in the source column removes exactly one of two rows."""
        rows = [
            _make_row(source="Umm al-Qura standard 18° Fajr, Makkah"),
            _make_row(source="Genuine researcher 2023"),
        ]
        kept = filter_non_genuine(pd.DataFrame(rows))
        assert len(kept) == 1

    def test_shafaq_ahmar_dropped(self):
        """The Shafaq Ahmar row is removed and the al-Abyad row remains."""
        rows = [
            _make_row(notes="Shafaq Ahmar (red dusk) observed"),
            _make_row(notes="Shafaq al-Abyad (white dusk)"),
        ]
        kept = filter_non_genuine(pd.DataFrame(rows))
        assert len(kept) == 1
        surviving_notes = kept.iloc[0]["notes"]
        assert "al-Abyad" in surviving_notes

    def test_case_insensitive_match(self):
        """An upper-cased marker in the notes still causes the row to drop."""
        shouting = _make_row(notes="AGGREGATE REPRESENTATIVE date; 4 records")
        kept = filter_non_genuine(pd.DataFrame([shouting]))
        assert len(kept) == 0

    def test_empty_dataframe(self):
        """A frame with columns but no rows comes back with no rows."""
        column_names = ["prayer", "date", "lat", "lng", "notes", "source"]
        kept = filter_non_genuine(pd.DataFrame(columns=column_names))
        assert len(kept) == 0

    def test_bad_note_markers_is_nonempty(self):
        """``BAD_NOTE_MARKERS`` holds at least 10 entries."""
        marker_count = len(BAD_NOTE_MARKERS)
        assert marker_count >= 10
class TestDeduplicateSightings:
    """Exercises ``deduplicate_sightings`` with rows built by ``_make_row``."""

    def test_no_duplicates_unchanged(self):
        """Two rows at different coordinates on one date are both retained."""
        frame = pd.DataFrame([
            _make_row(lat=40.0, lng=29.0, date="2023-01-01"),
            _make_row(lat=51.5, lng=-0.1, date="2023-01-01"),
        ])
        unique = deduplicate_sightings(frame)
        assert len(unique) == 2

    def test_exact_duplicate_removed(self):
        """Rows equal on (prayer, date, lat, lng): only the first one remains."""
        frame = pd.DataFrame([
            _make_row(lat=40.0, lng=29.0, date="2023-06-15", source="Source A"),
            _make_row(lat=40.0, lng=29.0, date="2023-06-15", source="Source B"),
        ])
        unique = deduplicate_sightings(frame)
        assert len(unique) == 1
        first_source = unique.iloc[0]["source"]
        assert first_source == "Source A"

    def test_near_duplicate_within_111m_removed(self):
        """Coordinates that agree after rounding to 3 decimals collapse to one row."""
        frame = pd.DataFrame([
            _make_row(lat=40.001, lng=29.001, date="2023-06-15", source="Source A"),
            _make_row(lat=40.0009, lng=29.0009, date="2023-06-15", source="Source B"),
        ])
        unique = deduplicate_sightings(frame)
        assert len(unique) == 1

    def test_same_location_different_prayer_both_kept(self):
        """Identical place and date but different prayer: neither row is dropped."""
        frame = pd.DataFrame([
            _make_row(prayer="fajr", lat=40.0, lng=29.0, date="2023-06-15"),
            _make_row(prayer="isha", lat=40.0, lng=29.0, date="2023-06-15"),
        ])
        unique = deduplicate_sightings(frame)
        assert len(unique) == 2

    def test_same_location_different_date_both_kept(self):
        """Identical place and prayer but different date: neither row is dropped."""
        frame = pd.DataFrame([
            _make_row(lat=40.0, lng=29.0, date="2023-06-15"),
            _make_row(lat=40.0, lng=29.0, date="2023-06-16"),
        ])
        unique = deduplicate_sightings(frame)
        assert len(unique) == 2

    def test_no_lat_lng_temp_columns_in_output(self):
        """Neither ``_lat_r`` nor ``_lng_r`` may appear among the output columns."""
        unique = deduplicate_sightings(pd.DataFrame([_make_row()]))
        for temp_column in ("_lat_r", "_lng_r"):
            assert temp_column not in unique.columns
class TestApplyQualityFilters:
    """Exercises ``apply_quality_filters`` on one-row frames.

    Every test builds its input through ``_df_with_angle`` so the fixture
    columns are defined in exactly one place.
    """

    def _df_with_angle(self, prayer="fajr", angle=14.0, lat=40.0,
                       date="2023-01-15", lng=29.0, elevation_m=100.0):
        """Return a one-row DataFrame for a sighting that carries an angle.

        ``utc_dt`` is set to 04:00 UTC on ``date`` so both fields describe the
        same day. ``source`` and ``notes`` are always the string ``"test"``.

        Args:
            prayer: Value for the ``prayer`` column.
            angle: Value for the ``angle`` column.
            lat: Value for the ``lat`` column.
            date: ``YYYY-MM-DD`` string for the ``date`` column.
            lng: Value for the ``lng`` column.
            elevation_m: Value for the ``elevation_m`` column.

        Returns:
            A ``pandas.DataFrame`` with a single row.
        """
        try:
            day = datetime.strptime(date, "%Y-%m-%d")
        except (TypeError, ValueError):
            # Keep the helper usable with a malformed ``date`` by falling back
            # to the fixed default day.
            day = datetime(2023, 1, 15)
        utc_dt = day.replace(hour=4, tzinfo=timezone.utc)
        return pd.DataFrame([{
            "prayer": prayer, "date": date, "utc_dt": utc_dt,
            "lat": lat, "lng": lng, "elevation_m": elevation_m,
            "angle": angle, "source": "test", "notes": "test",
        }])

    def test_valid_fajr_angle_kept(self):
        """A fajr row with angle 14.0 is retained."""
        df = self._df_with_angle(prayer="fajr", angle=14.0)
        assert len(apply_quality_filters(df)) == 1

    def test_valid_isha_angle_kept(self):
        """An isha row with angle 17.5 is retained."""
        df = self._df_with_angle(prayer="isha", angle=17.5)
        assert len(apply_quality_filters(df)) == 1

    def test_fajr_below_min_dropped(self):
        """Fajr angle below FAJR_MIN_DEG is dropped."""
        df = self._df_with_angle(prayer="fajr", angle=FAJR_MIN_DEG - 0.1)
        assert len(apply_quality_filters(df)) == 0

    def test_isha_below_min_dropped(self):
        """Isha angle below ISHA_MIN_DEG is dropped."""
        df = self._df_with_angle(prayer="isha", angle=ISHA_MIN_DEG - 0.1)
        assert len(apply_quality_filters(df)) == 0

    def test_angle_above_max_dropped(self):
        """Angle above MAX_DEG is dropped for both prayers."""
        for prayer in ("fajr", "isha"):
            df = self._df_with_angle(prayer=prayer, angle=MAX_DEG + 0.1)
            assert len(apply_quality_filters(df)) == 0

    def test_polar_lat_dropped(self):
        """Records with |lat| > 70 are dropped (checked at lat=71.0)."""
        df = self._df_with_angle(lat=71.0)
        assert len(apply_quality_filters(df)) == 0

    def test_null_island_dropped(self):
        """A record at lat 0.0, lng 0.0 (elevation 0.0) is dropped."""
        df = self._df_with_angle(lat=0.0, lng=0.0, elevation_m=0.0)
        assert len(apply_quality_filters(df)) == 0

    def test_future_date_dropped(self):
        """A record dated 1 January of next year is dropped."""
        # Read the clock once so ``date`` and ``utc_dt`` cannot end up in
        # different years if the test runs across a year boundary.
        next_year = datetime.now().year + 1
        df = self._df_with_angle(date=f"{next_year:04d}-01-01")
        assert len(apply_quality_filters(df)) == 0

    def test_nan_angle_dropped(self):
        """Records with NaN angle are dropped."""
        df = self._df_with_angle(angle=float("nan"))
        assert len(apply_quality_filters(df)) == 0