Add support for different timezones
Closed this issue · 1 comments
- ddlpy version: main
- Python version: 3.11
- Operating System: Windows
Description
When supplying a start_date/end_date, only UTC is supported. They are converted to the local timezone which is then fed into ddl. The timezones are consistent, date_start="2022-01-01"
gives t0=2022-01-01 01:00:00+01:00
. Consider adding support for different timezones, and apply the "what you give is what you get" principle (timezone-wise). In that case, we also need to check that the timezones from date_start
and date_end
are equal.
Todo:
- Rijkswaterstaat/wm-ws-dl#9 >> not necessary to wait for this one
- add support for timezones >> only support input timezones, as output timezones are not straightforward for
ddlpy.measurements_latest()
and ddlpy.measurements_amount()
. This also makes sure that the behaviour does not change with respect to older ddlpy versions. - not only for
ddlpy.measurements()
, also for other functions if needed - add a testcase with time strings, and one with time zones
- assert time start/stop in tests (incl timezone)
- check/improve long code below
What I Did
When adding timezones, we get "ValueError: Not naive datetime (tzinfo is already set)"
:
# Reproduction: tz-aware start/end dates make ddlpy.measurements() raise
# "ValueError: Not naive datetime (tzinfo is already set)".
import ddlpy # TODO: we require ddlpy from main/master branch (>0.1.0) >> pip install git+https://github.com/openearth/ddlpy
import pandas as pd
# input parameters: timezone-aware timestamps (UTC+01:00) trigger the error
start_date = pd.Timestamp(2019,9,1, tz="+01:00")
end_date = pd.Timestamp(2019,9,3, tz="+01:00")
# subset the location catalog to waterlevels (WATHTE) w.r.t. NAP at station HARVT10
locations = ddlpy.locations()
bool_hoedanigheid = locations['Hoedanigheid.Code'].isin(['NAP'])  # vertical reference
bool_stations = locations.index.isin(['HARVT10'])
bool_grootheid = locations['Grootheid.Code'].isin(['WATHTE'])  # quantity: waterlevel
locs_wathte = locations.loc[bool_grootheid & bool_hoedanigheid & bool_stations]
# retrieve with ddlpy: this call raises the ValueError because the timestamps are tz-aware
meas_wathte = ddlpy.measurements(locs_wathte.iloc[0], start_date=start_date, end_date=end_date)
Validation for timezones (independent of input tz, output tz is always MET, but this is accounted for in hatyan ddl code). It also succeeds when adjusting tz_ddl
. This is a useful testcase however:
# Validation part 1: query the DDL webservice directly with requests, so the
# ddlpy/dfm_tools results below can be compared against the raw service response.
import pandas as pd
import requests
date_start = "2009-01-01"
date_end = "2009-01-02"
tz_ddl = "+00:00"  # timezone offset appended to the Periode timestamps; adjusting it also succeeds
url_ddl = 'https://waterwebservices.rijkswaterstaat.nl/ONLINEWAARNEMINGENSERVICES_DBO/OphalenWaarnemingen'
# request body: measured waterlevels (WATHTE, Groepering NVT) w.r.t. NAP at Hoek van Holland
request_ddl = {'AquoPlusWaarnemingMetadata':
{'AquoMetadata':
{'Grootheid': {'Code': 'WATHTE'}, 'Groepering': {'Code': 'NVT'},
'Hoedanigheid': {'Code': 'NAP'},
# 'MeetApparaat': {'Code': '127'}
}
},
# 'Locatie': {'Locatie_MessageID': 10716, 'X': 571389.152745295, 'Y': 5694632.62008149, 'Naam': 'Walsoorden', 'Code': 'WALSODN'},
'Locatie': {'Locatie_MessageID': 10399, 'X': 576917.669784491, 'Y': 5759136.15818497, 'Naam': 'Hoek van Holland', 'Code': 'HOEKVHLD'},
'Periode': {'Begindatumtijd': f'{date_start}T00:00:00.000{tz_ddl}', 'Einddatumtijd': f'{date_end}T00:00:00.000{tz_ddl}'}}
resp = requests.post(url_ddl, json=request_ddl)
# the service reports failures both via the HTTP status and via a 'Succesvol' flag in the body
if not resp.ok:
raise Exception('%s for %s: %s'%(resp.reason, resp.url, str(resp.text)))
result = resp.json()
if not result['Succesvol']:
raise Exception('query not succesful, Foutmelding: %s from %s'%(result['Foutmelding'],url_ddl))
# flatten the measurements of the first (only) location into a time-indexed dataframe
result_pd = pd.json_normalize(result['WaarnemingenLijst'][0]["MetingenLijst"])
print(result_pd[["Tijdstip","Meetwaarde.Waarde_Numeriek"]]) # 3 duplicate times for WALSODN 2010-01-01 etc
result_idx = result_pd[["Tijdstip","Meetwaarde.Waarde_Numeriek"]].set_index("Tijdstip")
result_idx.index = pd.DatetimeIndex(result_idx.index)
# load dfmt ddl dataset (dfm_tools retrieves via ddlpy and writes a netcdf file per station)
meta_dict = {'Grootheid.Code':'WATHTE', 'Groepering.Code':'NVT', #combination for measured waterlevels
'Hoedanigheid.Code':'NAP', # vertical reference. Hoedanigheid is necessary for eg EURPFM/LICHTELGRE, where NAP and MSL values are available while it should only contain MSL #MSL, NAP, PLAATSLR, TAW, NVT (from cat_aquometadatalijst_waterhoogte['Hoedanigheid.Code'])
# 'MeetApparaat.Code':'127', # measurement device type. MeetApparaat.Code is necessary for IJMDBTHVN/ROOMPBTN/LICHTELGRE, where also radar measurements are available (all other stations are vlotter and these stations also have all important data in vlotter) TODO: Except LICHTELGRE/K13APFM which have Radar/Stappenbaak en Radar as MeetApparaat
}
import dfm_tools as dfmt
import xarray as xr
ddl_slev_gdf = dfmt.ssh_catalog_subset(source="rwsddl", meta_dict=meta_dict)
dir_output = "."
bool_hoek = ddl_slev_gdf["Code"] == "HOEKVHLD"  # subset the catalog to Hoek van Holland
ddl_slev_gdf_sel = ddl_slev_gdf.loc[bool_hoek]
dfmt.ssh_retrieve_data(ssh_catalog_gpd=ddl_slev_gdf_sel, dir_output=dir_output, time_min=date_start, time_max=date_end, meta_dict=meta_dict)
ds = xr.open_dataset("HOEKVHLD.nc")
ts_dfmt_gmt = ds[["waterlevel"]].to_pandas()
# localized as UTC here, then converted to MET (UTC+01:00) to compare with the other sources
ts_dfmt_gmt.index = ts_dfmt_gmt.index.tz_localize("UTC+00:00")
ts_dfmt_gmt.index = ts_dfmt_gmt.index.tz_convert("UTC+01:00")
ts_dfmt_gmt["waterlevel"] *= 100  # scale by 100 to match the units of the other sources (presumably m -> cm)
del ds
# load ddlpy dataset: same station/quantity, both the regular timeseries (Groepering NVT)
# and the non-NVT grouping (plotted below as "ddlpy ext")
import ddlpy
locations = ddlpy.locations()
bool_groepering_ts = locations["Groepering.Code"].isin(["NVT"])
bool_stations = locations.index.isin(['HOEKVHLD'])
bool_grootheid = locations['Grootheid.Code'].isin(['WATHTE'])
selected_ts = locations.loc[bool_groepering_ts & bool_stations & bool_grootheid]
selected_ext = locations.loc[~bool_groepering_ts & bool_stations & bool_grootheid]
# if we pass one row to the measurements function you can get all the measurements
measurements_ts = ddlpy.measurements(selected_ts.iloc[0], date_start, date_end)
measurements_ext = ddlpy.measurements(selected_ext.iloc[0], date_start, date_end)
ts_ddlpy = measurements_ts[['Meetwaarde.Waarde_Numeriek']]
ts_ddlpy_ext = measurements_ext[['Meetwaarde.Waarde_Numeriek']]
# load validation dataset from a local dia file (MET tz)
import hatyan
file_dia = r"c:\DATA\hatyan_data_acceptancetests\predictie2019\HOEKVHLD_obs1.txt"
ts = hatyan.readts_dia(file_dia)
# note the order here: sliced with a naive label first, then localized to MET (UTC+01:00)
ts = ts.loc[slice(None, "2009-01-01")]
ts.index = ts.index.tz_localize("UTC+01:00")
ts["values"] *= 100  # scale by 100 to match the units of the other sources
# compare sources: plot all four timeseries on a single axis
import matplotlib.pyplot as plt
plt.close("all")
fig, ax = plt.subplots()
ax.plot(result_idx["Meetwaarde.Waarde_Numeriek"], label="waterwebservices")
ax.plot(ts_dfmt_gmt["waterlevel"], "--", label="dfmt ddl (ddlpy)")
ax.plot(ts_ddlpy["Meetwaarde.Waarde_Numeriek"], "--", label="ddlpy")
ax.plot(ts_ddlpy_ext["Meetwaarde.Waarde_Numeriek"], "xk", label="ddlpy ext")
ax.plot(ts["values"], "x:", label="validation")
ax.legend(loc=1)
Get the timezone from a string with pd.Timestamp, for instance to add it to df.index:
pd.Timestamp("2020-03-22 00:00:00 +01:00").tz
Out[65]: datetime.timezone(datetime.timedelta(seconds=3600))
After implementation of this issue and some cleanups in the example code we get a better comparison figure:
# Comparison after implementation: tz-aware input strings are now supported,
# so the period is passed to the DDL webservice in ISO format including the offset.
import pandas as pd
import requests
# tz-aware input period (MET, UTC+01:00)
date_start = "2009-01-01 00:00:00 +01:00"
date_end = "2009-01-02 01:00:00 +01:00"
date_start_str = pd.Timestamp(date_start).isoformat(timespec='milliseconds')
date_end_str = pd.Timestamp(date_end).isoformat(timespec='milliseconds')
url_ddl = 'https://waterwebservices.rijkswaterstaat.nl/ONLINEWAARNEMINGENSERVICES_DBO/OphalenWaarnemingen'
# request body: measured waterlevels (WATHTE, Groepering NVT) w.r.t. NAP at Hoek van Holland
request_ddl = {'AquoPlusWaarnemingMetadata':
{'AquoMetadata':
{'Grootheid': {'Code': 'WATHTE'}, 'Groepering': {'Code': 'NVT'},
'Hoedanigheid': {'Code': 'NAP'},
# 'MeetApparaat': {'Code': '127'}
}
},
# 'Locatie': {'Locatie_MessageID': 10716, 'X': 571389.152745295, 'Y': 5694632.62008149, 'Naam': 'Walsoorden', 'Code': 'WALSODN'},
'Locatie': {'Locatie_MessageID': 10399, 'X': 576917.669784491, 'Y': 5759136.15818497, 'Naam': 'Hoek van Holland', 'Code': 'HOEKVHLD'},
'Periode': {'Begindatumtijd': date_start_str, 'Einddatumtijd': date_end_str}}
resp = requests.post(url_ddl, json=request_ddl)
# the service reports failures both via the HTTP status and via a 'Succesvol' flag in the body
if not resp.ok:
raise Exception('%s for %s: %s'%(resp.reason, resp.url, str(resp.text)))
result = resp.json()
if not result['Succesvol']:
raise Exception('query not succesful, Foutmelding: %s from %s'%(result['Foutmelding'],url_ddl))
# flatten the measurements of the first (only) location into a time-indexed dataframe
result_pd = pd.json_normalize(result['WaarnemingenLijst'][0]["MetingenLijst"])
print(result_pd[["Tijdstip","Meetwaarde.Waarde_Numeriek"]]) # 3 duplicate times for WALSODN 2010-01-01 etc
result_idx = result_pd[["Tijdstip","Meetwaarde.Waarde_Numeriek"]].set_index("Tijdstip")
result_idx.index = pd.DatetimeIndex(result_idx.index)
# load dfmt ddl dataset (dfm_tools retrieves via ddlpy and writes a netcdf file per station)
meta_dict = {'Grootheid.Code':'WATHTE', 'Groepering.Code':'NVT', #combination for measured waterlevels
'Hoedanigheid.Code':'NAP', # vertical reference. Hoedanigheid is necessary for eg EURPFM/LICHTELGRE, where NAP and MSL values are available while it should only contain MSL #MSL, NAP, PLAATSLR, TAW, NVT (from cat_aquometadatalijst_waterhoogte['Hoedanigheid.Code'])
# 'MeetApparaat.Code':'127', # measurement device type. MeetApparaat.Code is necessary for IJMDBTHVN/ROOMPBTN/LICHTELGRE, where also radar measurements are available (all other stations are vlotter and these stations also have all important data in vlotter) TODO: Except LICHTELGRE/K13APFM which have Radar/Stappenbaak en Radar as MeetApparaat
}
import dfm_tools as dfmt
import xarray as xr
ddl_slev_gdf = dfmt.ssh_catalog_subset(source="rwsddl", meta_dict=meta_dict)
dir_output = "."
bool_hoek = ddl_slev_gdf["Code"] == "HOEKVHLD"  # subset the catalog to Hoek van Holland
ddl_slev_gdf_sel = ddl_slev_gdf.loc[bool_hoek]
dfmt.ssh_retrieve_data(ssh_catalog_gpd=ddl_slev_gdf_sel, dir_output=dir_output, time_min=date_start, time_max=date_end)
ds = xr.open_dataset("HOEKVHLD.nc")
ts_dfmt_gmt = ds[["waterlevel"]].to_pandas()
# localized as UTC here, then converted to MET (UTC+01:00) to compare with the other sources
ts_dfmt_gmt.index = ts_dfmt_gmt.index.tz_localize("UTC+00:00")
ts_dfmt_gmt.index = ts_dfmt_gmt.index.tz_convert("UTC+01:00")
ts_dfmt_gmt["waterlevel"] *= 100  # scale by 100 to match the units of the other sources (presumably m -> cm)
del ds
# load ddlpy dataset: same station/quantity, both the regular timeseries (Groepering NVT)
# and the non-NVT grouping (plotted below as "ddlpy ext"); tz-aware strings now accepted
import ddlpy
locations = ddlpy.locations()
bool_groepering_ts = locations["Groepering.Code"].isin(["NVT"])
bool_stations = locations.index.isin(['HOEKVHLD'])
bool_grootheid = locations['Grootheid.Code'].isin(['WATHTE'])
selected_ts = locations.loc[bool_groepering_ts & bool_stations & bool_grootheid]
selected_ext = locations.loc[~bool_groepering_ts & bool_stations & bool_grootheid]
# if we pass one row to the measurements function you can get all the measurements
measurements_ts = ddlpy.measurements(selected_ts.iloc[0], date_start, date_end)
measurements_ext = ddlpy.measurements(selected_ext.iloc[0], date_start, date_end)
ts_ddlpy = measurements_ts[['Meetwaarde.Waarde_Numeriek']]
ts_ddlpy_ext = measurements_ext[['Meetwaarde.Waarde_Numeriek']]
# load validation dataset from a local dia file (MET tz)
import hatyan
file_dia = r"c:\DATA\hatyan_data_acceptancetests\predictie2019\HOEKVHLD_obs1.txt"
ts = hatyan.readts_dia(file_dia)
# note the order here: localized to MET (UTC+01:00) first, then sliced with the tz-aware strings
ts.index = ts.index.tz_localize("UTC+01:00")
ts = ts.loc[slice(date_start, date_end)]
ts["values"] *= 100  # scale by 100 to match the units of the other sources
# compare sources: plot all four timeseries on a single axis
import matplotlib.pyplot as plt
plt.close("all")
fig, ax = plt.subplots(figsize=(10,5))
ax.plot(result_idx["Meetwaarde.Waarde_Numeriek"], label="waterwebservices")
ax.plot(ts_dfmt_gmt["waterlevel"], "--", label="dfmt ddl (ddlpy)")
ax.plot(ts_ddlpy["Meetwaarde.Waarde_Numeriek"], "--", label="ddlpy")
ax.plot(ts_ddlpy_ext["Meetwaarde.Waarde_Numeriek"], "xk", label="ddlpy ext")
ax.plot(ts["values"], "x:", label="validation diafile")
ax.legend(loc=1)
ax.grid()
fig.tight_layout()
The comparison figure shows that the time extents of all sources (ddlpy, ddl, dfm_tools and diafile) are perfectly aligned: