Deltares/ddlpy

Add support for different timezones

Closed this issue · 1 comments

  • ddlpy version: main
  • Python version: 3.11
  • Operating System: Windows

Description

When supplying a start_date/end_date, only UTC is supported. They are converted to the local timezone which is then fed into ddl. The timezones are consistent, date_start="2022-01-01" gives t0=2022-01-01 01:00:00+01:00. Consider to add support for different timezones, and use the what you give is what you get principle (timezone-wise). In that case, we also need to check if the timezones from date_start and date_end are equal.

Todo:

  • Rijkswaterstaat/wm-ws-dl#9 >> not necessary to wait for this one
  • add support for timezones >> only support input timezones, as output timezones is not straightforward for ddlpy.measurements_latest() and ddlpy.measurements_amount(). This also makes sure that the behaviour does not change with respect to older ddlpy versions.
  • not only for ddlpy.measurements(), also for other functions if needed
  • add testcase with time stings, and one with time zones
  • assert time start/stop in tests (incl timezone)
  • check/improve long code below

What I Did

When adding timezones, we get "ValueError: Not naive datetime (tzinfo is already set)":

import ddlpy # TODO: we require ddlpy from main/master branch (>0.1.0) >> pip install git+https://github.com/openearth/ddlpy
import pandas as pd

# input parameters
start_date  = pd.Timestamp(2019,9,1, tz="+01:00")
end_date = pd.Timestamp(2019,9,3, tz="+01:00")

locations = ddlpy.locations()
bool_hoedanigheid = locations['Hoedanigheid.Code'].isin(['NAP'])
bool_stations = locations.index.isin(['HARVT10'])
bool_grootheid = locations['Grootheid.Code'].isin(['WATHTE'])
locs_wathte = locations.loc[bool_grootheid & bool_hoedanigheid & bool_stations]

# retrieve with ddlpy
meas_wathte = ddlpy.measurements(locs_wathte.iloc[0], start_date=start_date, end_date=end_date)

Validation for timezones (independent of input tz, output tz is always MET, but this is accounted for in hatyan ddl code). It also succeeds when adjusting tz_ddl. This is a useful testcase however:

import pandas as pd
import requests

date_start = "2009-01-01"
date_end = "2009-01-02"

tz_ddl = "+00:00"
url_ddl = 'https://waterwebservices.rijkswaterstaat.nl/ONLINEWAARNEMINGENSERVICES_DBO/OphalenWaarnemingen'
request_ddl = {'AquoPlusWaarnemingMetadata': 
               {'AquoMetadata': 
                {'Grootheid': {'Code': 'WATHTE'}, 'Groepering': {'Code': 'NVT'}, 
                 'Hoedanigheid': {'Code': 'NAP'},
                 # 'MeetApparaat': {'Code': '127'}
                 }
                }, 
                # 'Locatie': {'Locatie_MessageID': 10716, 'X': 571389.152745295, 'Y': 5694632.62008149, 'Naam': 'Walsoorden', 'Code': 'WALSODN'}, 
                'Locatie': {'Locatie_MessageID': 10399, 'X': 576917.669784491, 'Y': 5759136.15818497, 'Naam': 'Hoek van Holland', 'Code': 'HOEKVHLD'},
                'Periode': {'Begindatumtijd': f'{date_start}T00:00:00.000{tz_ddl}', 'Einddatumtijd': f'{date_end}T00:00:00.000{tz_ddl}'}}
resp = requests.post(url_ddl, json=request_ddl)
if not resp.ok:
    raise Exception('%s for %s: %s'%(resp.reason, resp.url, str(resp.text)))
result = resp.json()
if not result['Succesvol']:
    raise Exception('query not succesful, Foutmelding: %s from %s'%(result['Foutmelding'],url_ddl))
result_pd = pd.json_normalize(result['WaarnemingenLijst'][0]["MetingenLijst"])
print(result_pd[["Tijdstip","Meetwaarde.Waarde_Numeriek"]]) # 3 duplicate times for WALSODN 2010-01-01 etc
result_idx = result_pd[["Tijdstip","Meetwaarde.Waarde_Numeriek"]].set_index("Tijdstip")
result_idx.index = pd.DatetimeIndex(result_idx.index)

# load dfmt ddl dataset
meta_dict = {'Grootheid.Code':'WATHTE', 'Groepering.Code':'NVT', #combination for measured waterlevels
             'Hoedanigheid.Code':'NAP', # vertical reference. Hoedanigheid is necessary for eg EURPFM/LICHTELGRE, where NAP and MSL values are available while it should only contain MSL #MSL, NAP, PLAATSLR, TAW, NVT (from cat_aquometadatalijst_waterhoogte['Hoedanigheid.Code'])
             # 'MeetApparaat.Code':'127', # measurement device type. MeetApparaat.Code is necessary for IJMDBTHVN/ROOMPBTN/LICHTELGRE, where also radar measurements are available (all other stations are vlotter and these stations also have all important data in vlotter) TODO: Except LICHTELGRE/K13APFM which have Radar/Stappenbaak en Radar as MeetApparaat
             }
import dfm_tools as dfmt
import xarray as xr
ddl_slev_gdf = dfmt.ssh_catalog_subset(source="rwsddl", meta_dict=meta_dict)
dir_output = "."
bool_hoek = ddl_slev_gdf["Code"] == "HOEKVHLD"
ddl_slev_gdf_sel = ddl_slev_gdf.loc[bool_hoek]
dfmt.ssh_retrieve_data(ssh_catalog_gpd=ddl_slev_gdf_sel, dir_output=dir_output, time_min=date_start, time_max=date_end, meta_dict=meta_dict)
ds = xr.open_dataset("HOEKVHLD.nc")
ts_dfmt_gmt = ds[["waterlevel"]].to_pandas()
ts_dfmt_gmt.index = ts_dfmt_gmt.index.tz_localize("UTC+00:00")
ts_dfmt_gmt.index = ts_dfmt_gmt.index.tz_convert("UTC+01:00")
ts_dfmt_gmt["waterlevel"] *= 100
del ds

# load ddlpy dataset
import ddlpy
locations = ddlpy.locations()
bool_groepering_ts = locations["Groepering.Code"].isin(["NVT"])
bool_stations = locations.index.isin(['HOEKVHLD'])
bool_grootheid = locations['Grootheid.Code'].isin(['WATHTE'])
selected_ts = locations.loc[bool_groepering_ts & bool_stations & bool_grootheid]
selected_ext = locations.loc[~bool_groepering_ts & bool_stations & bool_grootheid]
# if we pass one row to the measurements function you can get all the measurements
measurements_ts = ddlpy.measurements(selected_ts.iloc[0], date_start, date_end)
measurements_ext = ddlpy.measurements(selected_ext.iloc[0], date_start, date_end)

ts_ddlpy = measurements_ts[['Meetwaarde.Waarde_Numeriek']]
ts_ddlpy_ext = measurements_ext[['Meetwaarde.Waarde_Numeriek']]

# load validation dataset (MET tz)
import hatyan
file_dia = r"c:\DATA\hatyan_data_acceptancetests\predictie2019\HOEKVHLD_obs1.txt"
ts = hatyan.readts_dia(file_dia)
ts = ts.loc[slice(None, "2009-01-01")]
ts.index = ts.index.tz_localize("UTC+01:00")
ts["values"] *= 100

# compare sources
import matplotlib.pyplot as plt
plt.close("all")
fig, ax = plt.subplots()
ax.plot(result_idx["Meetwaarde.Waarde_Numeriek"], label="waterwebservices")
ax.plot(ts_dfmt_gmt["waterlevel"], "--", label="dfmt ddl (ddlpy)")
ax.plot(ts_ddlpy["Meetwaarde.Waarde_Numeriek"], "--", label="ddlpy")
ax.plot(ts_ddlpy_ext["Meetwaarde.Waarde_Numeriek"], "xk", label="ddlpy ext")
ax.plot(ts["values"], "x:", label="validation")
ax.legend(loc=1)

Gives:
image

Get the timezone from a string with pd.Timestamp to add to df.index:

pd.Timestamp("2020-03-22 00:00:00 +01:00").tz
Out[65]: datetime.timezone(datetime.timedelta(seconds=3600))

After implementation of this issue and some cleanups in the example code we get a better comparison figure:

import pandas as pd
import requests

date_start = "2009-01-01 00:00:00 +01:00"
date_end = "2009-01-02 01:00:00 +01:00"
date_start_str = pd.Timestamp(date_start).isoformat(timespec='milliseconds')
date_end_str = pd.Timestamp(date_end).isoformat(timespec='milliseconds')

url_ddl = 'https://waterwebservices.rijkswaterstaat.nl/ONLINEWAARNEMINGENSERVICES_DBO/OphalenWaarnemingen'
request_ddl = {'AquoPlusWaarnemingMetadata': 
               {'AquoMetadata': 
                {'Grootheid': {'Code': 'WATHTE'}, 'Groepering': {'Code': 'NVT'}, 
                 'Hoedanigheid': {'Code': 'NAP'},
                 # 'MeetApparaat': {'Code': '127'}
                 }
                }, 
                # 'Locatie': {'Locatie_MessageID': 10716, 'X': 571389.152745295, 'Y': 5694632.62008149, 'Naam': 'Walsoorden', 'Code': 'WALSODN'}, 
                'Locatie': {'Locatie_MessageID': 10399, 'X': 576917.669784491, 'Y': 5759136.15818497, 'Naam': 'Hoek van Holland', 'Code': 'HOEKVHLD'},
                'Periode': {'Begindatumtijd': date_start_str, 'Einddatumtijd': date_end_str}}
resp = requests.post(url_ddl, json=request_ddl)
if not resp.ok:
    raise Exception('%s for %s: %s'%(resp.reason, resp.url, str(resp.text)))
result = resp.json()
if not result['Succesvol']:
    raise Exception('query not succesful, Foutmelding: %s from %s'%(result['Foutmelding'],url_ddl))
result_pd = pd.json_normalize(result['WaarnemingenLijst'][0]["MetingenLijst"])
print(result_pd[["Tijdstip","Meetwaarde.Waarde_Numeriek"]]) # 3 duplicate times for WALSODN 2010-01-01 etc
result_idx = result_pd[["Tijdstip","Meetwaarde.Waarde_Numeriek"]].set_index("Tijdstip")
result_idx.index = pd.DatetimeIndex(result_idx.index)

# load dfmt ddl dataset
meta_dict = {'Grootheid.Code':'WATHTE', 'Groepering.Code':'NVT', #combination for measured waterlevels
             'Hoedanigheid.Code':'NAP', # vertical reference. Hoedanigheid is necessary for eg EURPFM/LICHTELGRE, where NAP and MSL values are available while it should only contain MSL #MSL, NAP, PLAATSLR, TAW, NVT (from cat_aquometadatalijst_waterhoogte['Hoedanigheid.Code'])
             # 'MeetApparaat.Code':'127', # measurement device type. MeetApparaat.Code is necessary for IJMDBTHVN/ROOMPBTN/LICHTELGRE, where also radar measurements are available (all other stations are vlotter and these stations also have all important data in vlotter) TODO: Except LICHTELGRE/K13APFM which have Radar/Stappenbaak en Radar as MeetApparaat
             }
import dfm_tools as dfmt
import xarray as xr
ddl_slev_gdf = dfmt.ssh_catalog_subset(source="rwsddl", meta_dict=meta_dict)
dir_output = "."
bool_hoek = ddl_slev_gdf["Code"] == "HOEKVHLD"
ddl_slev_gdf_sel = ddl_slev_gdf.loc[bool_hoek]
dfmt.ssh_retrieve_data(ssh_catalog_gpd=ddl_slev_gdf_sel, dir_output=dir_output, time_min=date_start, time_max=date_end)
ds = xr.open_dataset("HOEKVHLD.nc")
ts_dfmt_gmt = ds[["waterlevel"]].to_pandas()
ts_dfmt_gmt.index = ts_dfmt_gmt.index.tz_localize("UTC+00:00")
ts_dfmt_gmt.index = ts_dfmt_gmt.index.tz_convert("UTC+01:00")
ts_dfmt_gmt["waterlevel"] *= 100
del ds

# load ddlpy dataset
import ddlpy
locations = ddlpy.locations()
bool_groepering_ts = locations["Groepering.Code"].isin(["NVT"])
bool_stations = locations.index.isin(['HOEKVHLD'])
bool_grootheid = locations['Grootheid.Code'].isin(['WATHTE'])
selected_ts = locations.loc[bool_groepering_ts & bool_stations & bool_grootheid]
selected_ext = locations.loc[~bool_groepering_ts & bool_stations & bool_grootheid]
# if we pass one row to the measurements function you can get all the measurements
measurements_ts = ddlpy.measurements(selected_ts.iloc[0], date_start, date_end)
measurements_ext = ddlpy.measurements(selected_ext.iloc[0], date_start, date_end)

ts_ddlpy = measurements_ts[['Meetwaarde.Waarde_Numeriek']]
ts_ddlpy_ext = measurements_ext[['Meetwaarde.Waarde_Numeriek']]

# load validation dataset (MET tz)
import hatyan
file_dia = r"c:\DATA\hatyan_data_acceptancetests\predictie2019\HOEKVHLD_obs1.txt"
ts = hatyan.readts_dia(file_dia)
ts.index = ts.index.tz_localize("UTC+01:00")
ts = ts.loc[slice(date_start, date_end)]
ts["values"] *= 100

# compare sources
import matplotlib.pyplot as plt
plt.close("all")
fig, ax = plt.subplots(figsize=(10,5))
ax.plot(result_idx["Meetwaarde.Waarde_Numeriek"], label="waterwebservices")
ax.plot(ts_dfmt_gmt["waterlevel"], "--", label="dfmt ddl (ddlpy)")
ax.plot(ts_ddlpy["Meetwaarde.Waarde_Numeriek"], "--", label="ddlpy")
ax.plot(ts_ddlpy_ext["Meetwaarde.Waarde_Numeriek"], "xk", label="ddlpy ext")
ax.plot(ts["values"], "x:", label="validation diafile")
ax.legend(loc=1)
ax.grid()
fig.tight_layout()

The comparison figure shows that the time extents of all sources (ddlpy, ddl, dfm_tools and diafile) are perfectly aligned:
image