Add support for different timezones

Question

Add support for different timezones

Closed this issue 5 months ago · 1 comments

ddlpy version: main
Python version: 3.11
Operating System: Windows

Description

When supplying a start_date/end_date, only UTC is supported. They are converted to the local timezone which is then fed into ddl. The timezones are consistent, date_start="2022-01-01" gives t0=2022-01-01 01:00:00+01:00. Consider to add support for different timezones, and use the what you give is what you get principle (timezone-wise). In that case, we also need to check if the timezones from date_start and date_end are equal.

Todo:

Rijkswaterstaat/wm-ws-dl#9 >> not necessary to wait for this one
add support for timezones >> only support input timezones, as output timezones is not straightforward for ddlpy.measurements_latest() and ddlpy.measurements_amount(). This also makes sure that the behaviour does not change with respect to older ddlpy versions.
not only for ddlpy.measurements(), also for other functions if needed
add testcase with time stings, and one with time zones
assert time start/stop in tests (incl timezone)
check/improve long code below

What I Did

When adding timezones, we get "ValueError: Not naive datetime (tzinfo is already set)":

import ddlpy # TODO: we require ddlpy from main/master branch (>0.1.0) >> pip install git+https://github.com/openearth/ddlpy
import pandas as pd

# input parameters
start_date  = pd.Timestamp(2019,9,1, tz="+01:00")
end_date = pd.Timestamp(2019,9,3, tz="+01:00")

locations = ddlpy.locations()
bool_hoedanigheid = locations['Hoedanigheid.Code'].isin(['NAP'])
bool_stations = locations.index.isin(['HARVT10'])
bool_grootheid = locations['Grootheid.Code'].isin(['WATHTE'])
locs_wathte = locations.loc[bool_grootheid & bool_hoedanigheid & bool_stations]

# retrieve with ddlpy
meas_wathte = ddlpy.measurements(locs_wathte.iloc[0], start_date=start_date, end_date=end_date)

Validation for timezones (independent of input tz, output tz is always MET, but this is accounted for in hatyan ddl code). It also succeeds when adjusting tz_ddl. This is a useful testcase however:

import pandas as pd
import requests

date_start = "2009-01-01"
date_end = "2009-01-02"

tz_ddl = "+00:00"
url_ddl = 'https://waterwebservices.rijkswaterstaat.nl/ONLINEWAARNEMINGENSERVICES_DBO/OphalenWaarnemingen'
request_ddl = {'AquoPlusWaarnemingMetadata': 
               {'AquoMetadata': 
                {'Grootheid': {'Code': 'WATHTE'}, 'Groepering': {'Code': 'NVT'}, 
                 'Hoedanigheid': {'Code': 'NAP'},
                 # 'MeetApparaat': {'Code': '127'}
                 }
                }, 
                # 'Locatie': {'Locatie_MessageID': 10716, 'X': 571389.152745295, 'Y': 5694632.62008149, 'Naam': 'Walsoorden', 'Code': 'WALSODN'}, 
                'Locatie': {'Locatie_MessageID': 10399, 'X': 576917.669784491, 'Y': 5759136.15818497, 'Naam': 'Hoek van Holland', 'Code': 'HOEKVHLD'},
                'Periode': {'Begindatumtijd': f'{date_start}T00:00:00.000{tz_ddl}', 'Einddatumtijd': f'{date_end}T00:00:00.000{tz_ddl}'}}
resp = requests.post(url_ddl, json=request_ddl)
if not resp.ok:
    raise Exception('%s for %s: %s'%(resp.reason, resp.url, str(resp.text)))
result = resp.json()
if not result['Succesvol']:
    raise Exception('query not succesful, Foutmelding: %s from %s'%(result['Foutmelding'],url_ddl))
result_pd = pd.json_normalize(result['WaarnemingenLijst'][0]["MetingenLijst"])
print(result_pd[["Tijdstip","Meetwaarde.Waarde_Numeriek"]]) # 3 duplicate times for WALSODN 2010-01-01 etc
result_idx = result_pd[["Tijdstip","Meetwaarde.Waarde_Numeriek"]].set_index("Tijdstip")
result_idx.index = pd.DatetimeIndex(result_idx.index)

# load dfmt ddl dataset
meta_dict = {'Grootheid.Code':'WATHTE', 'Groepering.Code':'NVT', #combination for measured waterlevels
             'Hoedanigheid.Code':'NAP', # vertical reference. Hoedanigheid is necessary for eg EURPFM/LICHTELGRE, where NAP and MSL values are available while it should only contain MSL #MSL, NAP, PLAATSLR, TAW, NVT (from cat_aquometadatalijst_waterhoogte['Hoedanigheid.Code'])
             # 'MeetApparaat.Code':'127', # measurement device type. MeetApparaat.Code is necessary for IJMDBTHVN/ROOMPBTN/LICHTELGRE, where also radar measurements are available (all other stations are vlotter and these stations also have all important data in vlotter) TODO: Except LICHTELGRE/K13APFM which have Radar/Stappenbaak en Radar as MeetApparaat
             }
import dfm_tools as dfmt
import xarray as xr
ddl_slev_gdf = dfmt.ssh_catalog_subset(source="rwsddl", meta_dict=meta_dict)
dir_output = "."
bool_hoek = ddl_slev_gdf["Code"] == "HOEKVHLD"
ddl_slev_gdf_sel = ddl_slev_gdf.loc[bool_hoek]
dfmt.ssh_retrieve_data(ssh_catalog_gpd=ddl_slev_gdf_sel, dir_output=dir_output, time_min=date_start, time_max=date_end, meta_dict=meta_dict)
ds = xr.open_dataset("HOEKVHLD.nc")
ts_dfmt_gmt = ds[["waterlevel"]].to_pandas()
ts_dfmt_gmt.index = ts_dfmt_gmt.index.tz_localize("UTC+00:00")
ts_dfmt_gmt.index = ts_dfmt_gmt.index.tz_convert("UTC+01:00")
ts_dfmt_gmt["waterlevel"] *= 100
del ds

# load ddlpy dataset
import ddlpy
locations = ddlpy.locations()
bool_groepering_ts = locations["Groepering.Code"].isin(["NVT"])
bool_stations = locations.index.isin(['HOEKVHLD'])
bool_grootheid = locations['Grootheid.Code'].isin(['WATHTE'])
selected_ts = locations.loc[bool_groepering_ts & bool_stations & bool_grootheid]
selected_ext = locations.loc[~bool_groepering_ts & bool_stations & bool_grootheid]
# if we pass one row to the measurements function you can get all the measurements
measurements_ts = ddlpy.measurements(selected_ts.iloc[0], date_start, date_end)
measurements_ext = ddlpy.measurements(selected_ext.iloc[0], date_start, date_end)

ts_ddlpy = measurements_ts[['Meetwaarde.Waarde_Numeriek']]
ts_ddlpy_ext = measurements_ext[['Meetwaarde.Waarde_Numeriek']]

# load validation dataset (MET tz)
import hatyan
file_dia = r"c:\DATA\hatyan_data_acceptancetests\predictie2019\HOEKVHLD_obs1.txt"
ts = hatyan.readts_dia(file_dia)
ts = ts.loc[slice(None, "2009-01-01")]
ts.index = ts.index.tz_localize("UTC+01:00")
ts["values"] *= 100

# compare sources
import matplotlib.pyplot as plt
plt.close("all")
fig, ax = plt.subplots()
ax.plot(result_idx["Meetwaarde.Waarde_Numeriek"], label="waterwebservices")
ax.plot(ts_dfmt_gmt["waterlevel"], "--", label="dfmt ddl (ddlpy)")
ax.plot(ts_ddlpy["Meetwaarde.Waarde_Numeriek"], "--", label="ddlpy")
ax.plot(ts_ddlpy_ext["Meetwaarde.Waarde_Numeriek"], "xk", label="ddlpy ext")
ax.plot(ts["values"], "x:", label="validation")
ax.legend(loc=1)

Gives:

Get the timezone from a string with pd.Timestamp to add to df.index:

pd.Timestamp("2020-03-22 00:00:00 +01:00").tz
Out[65]: datetime.timezone(datetime.timedelta(seconds=3600))

Answer 1 · 2024-04-26T09:53:55.000Z

After implementation of this issue and some cleanups in the example code we get a better comparison figure:

import pandas as pd
import requests

date_start = "2009-01-01 00:00:00 +01:00"
date_end = "2009-01-02 01:00:00 +01:00"
date_start_str = pd.Timestamp(date_start).isoformat(timespec='milliseconds')
date_end_str = pd.Timestamp(date_end).isoformat(timespec='milliseconds')

url_ddl = 'https://waterwebservices.rijkswaterstaat.nl/ONLINEWAARNEMINGENSERVICES_DBO/OphalenWaarnemingen'
request_ddl = {'AquoPlusWaarnemingMetadata': 
               {'AquoMetadata': 
                {'Grootheid': {'Code': 'WATHTE'}, 'Groepering': {'Code': 'NVT'}, 
                 'Hoedanigheid': {'Code': 'NAP'},
                 # 'MeetApparaat': {'Code': '127'}
                 }
                }, 
                # 'Locatie': {'Locatie_MessageID': 10716, 'X': 571389.152745295, 'Y': 5694632.62008149, 'Naam': 'Walsoorden', 'Code': 'WALSODN'}, 
                'Locatie': {'Locatie_MessageID': 10399, 'X': 576917.669784491, 'Y': 5759136.15818497, 'Naam': 'Hoek van Holland', 'Code': 'HOEKVHLD'},
                'Periode': {'Begindatumtijd': date_start_str, 'Einddatumtijd': date_end_str}}
resp = requests.post(url_ddl, json=request_ddl)
if not resp.ok:
    raise Exception('%s for %s: %s'%(resp.reason, resp.url, str(resp.text)))
result = resp.json()
if not result['Succesvol']:
    raise Exception('query not succesful, Foutmelding: %s from %s'%(result['Foutmelding'],url_ddl))
result_pd = pd.json_normalize(result['WaarnemingenLijst'][0]["MetingenLijst"])
print(result_pd[["Tijdstip","Meetwaarde.Waarde_Numeriek"]]) # 3 duplicate times for WALSODN 2010-01-01 etc
result_idx = result_pd[["Tijdstip","Meetwaarde.Waarde_Numeriek"]].set_index("Tijdstip")
result_idx.index = pd.DatetimeIndex(result_idx.index)

# load dfmt ddl dataset
meta_dict = {'Grootheid.Code':'WATHTE', 'Groepering.Code':'NVT', #combination for measured waterlevels
             'Hoedanigheid.Code':'NAP', # vertical reference. Hoedanigheid is necessary for eg EURPFM/LICHTELGRE, where NAP and MSL values are available while it should only contain MSL #MSL, NAP, PLAATSLR, TAW, NVT (from cat_aquometadatalijst_waterhoogte['Hoedanigheid.Code'])
             # 'MeetApparaat.Code':'127', # measurement device type. MeetApparaat.Code is necessary for IJMDBTHVN/ROOMPBTN/LICHTELGRE, where also radar measurements are available (all other stations are vlotter and these stations also have all important data in vlotter) TODO: Except LICHTELGRE/K13APFM which have Radar/Stappenbaak en Radar as MeetApparaat
             }
import dfm_tools as dfmt
import xarray as xr
ddl_slev_gdf = dfmt.ssh_catalog_subset(source="rwsddl", meta_dict=meta_dict)
dir_output = "."
bool_hoek = ddl_slev_gdf["Code"] == "HOEKVHLD"
ddl_slev_gdf_sel = ddl_slev_gdf.loc[bool_hoek]
dfmt.ssh_retrieve_data(ssh_catalog_gpd=ddl_slev_gdf_sel, dir_output=dir_output, time_min=date_start, time_max=date_end)
ds = xr.open_dataset("HOEKVHLD.nc")
ts_dfmt_gmt = ds[["waterlevel"]].to_pandas()
ts_dfmt_gmt.index = ts_dfmt_gmt.index.tz_localize("UTC+00:00")
ts_dfmt_gmt.index = ts_dfmt_gmt.index.tz_convert("UTC+01:00")
ts_dfmt_gmt["waterlevel"] *= 100
del ds

# load ddlpy dataset
import ddlpy
locations = ddlpy.locations()
bool_groepering_ts = locations["Groepering.Code"].isin(["NVT"])
bool_stations = locations.index.isin(['HOEKVHLD'])
bool_grootheid = locations['Grootheid.Code'].isin(['WATHTE'])
selected_ts = locations.loc[bool_groepering_ts & bool_stations & bool_grootheid]
selected_ext = locations.loc[~bool_groepering_ts & bool_stations & bool_grootheid]
# if we pass one row to the measurements function you can get all the measurements
measurements_ts = ddlpy.measurements(selected_ts.iloc[0], date_start, date_end)
measurements_ext = ddlpy.measurements(selected_ext.iloc[0], date_start, date_end)

ts_ddlpy = measurements_ts[['Meetwaarde.Waarde_Numeriek']]
ts_ddlpy_ext = measurements_ext[['Meetwaarde.Waarde_Numeriek']]

# load validation dataset (MET tz)
import hatyan
file_dia = r"c:\DATA\hatyan_data_acceptancetests\predictie2019\HOEKVHLD_obs1.txt"
ts = hatyan.readts_dia(file_dia)
ts.index = ts.index.tz_localize("UTC+01:00")
ts = ts.loc[slice(date_start, date_end)]
ts["values"] *= 100

# compare sources
import matplotlib.pyplot as plt
plt.close("all")
fig, ax = plt.subplots(figsize=(10,5))
ax.plot(result_idx["Meetwaarde.Waarde_Numeriek"], label="waterwebservices")
ax.plot(ts_dfmt_gmt["waterlevel"], "--", label="dfmt ddl (ddlpy)")
ax.plot(ts_ddlpy["Meetwaarde.Waarde_Numeriek"], "--", label="ddlpy")
ax.plot(ts_ddlpy_ext["Meetwaarde.Waarde_Numeriek"], "xk", label="ddlpy ext")
ax.plot(ts["values"], "x:", label="validation diafile")
ax.legend(loc=1)
ax.grid()
fig.tight_layout()

The comparison figure shows that the time extents of all sources (ddlpy, ddl, dfm_tools and diafile) are perfectly aligned: