Running aerobulk on synthetic data
The notebook shows that aerobulk and its Python wrapper are properly installed and running on a locally saved dataset. However, when I try to create "synthetic" data and run it through the algorithm, it crashes, either killing my kernel or producing the various errors shown in the notebook. If I use the numpy version on a single timestep it works, but a for loop over all times does not. This, in combination with the main error "too many indices for array: array is 0-dimensional, but 1 were indexed", makes me think the problem is in the way I created the data / the structure itself.
Any help is very much appreciated!
minimally_reproducible_example.pdf
# Valid data range according to documentation
# https://github.com/xgcm/aerobulk-python/blob/main/source/aerobulk/flux.py
VALID_VALUE_RANGES = {'sst': [270, 320],
't_zt': [180, 330],
'hum_zt': [0, 0.08],
'u_zu': [-50, 50],
'v_zu': [-50, 50],
'slp': [80000, 110000],
'rad_sw': [0, 1500],
'rad_lw': [0, 750]
}
sst = []
t_zt = []
hum_zt = []
u_zu = []
v_zu = []
slp = []
zt = []
zu = []
data_points = 10000
#INITIAL: don't specify any kind of distribution. Later, look at choices/ways in random to specify
#Later, also experiment with specifying a random difference, random sst, and adding for air temp
for data_point in range(0, data_points):
    sst.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['sst'][0]*1000, VALID_VALUE_RANGES['sst'][1]*1000))/1000)) # multiply by 1000 and divide back to get 0.001 precision
    t_zt.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['t_zt'][0]*1000, VALID_VALUE_RANGES['t_zt'][1]*1000))/1000))
    #t_zt.append(sst[data_point]+10) # instead of 10, use a random range of difference between 270-180 and 320-330
    hum_zt.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['hum_zt'][0]*1000000, VALID_VALUE_RANGES['hum_zt'][1]*1000000))/1000000))
    u_zu.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['u_zu'][0]*1000, VALID_VALUE_RANGES['u_zu'][1]*1000))/1000))
    v_zu.append(float(0))
    slp.append(101000.0)
    zt.append(float(decimal.Decimal(random.randrange(12*10, 19*10)/10))) # use 12 to 19, mirroring the other dataset
    zu.append(float(decimal.Decimal(random.randrange(15*10, 22*10)/10))) # use 15 to 22, mirroring the other dataset
# Cast the lists (and the scalar slp) into 2-D numpy arrays of shape (n, 1)
sst = np.reshape(sst,(-1,1))
t_zt = np.reshape(t_zt,(-1,1))
hum_zt = np.reshape(hum_zt,(-1,1))
u_zu = np.reshape(u_zu,(-1,1))
v_zu = np.reshape(v_zu,(-1,1))
zt = np.reshape(zt,(-1,1))
zu = np.reshape(zu,(-1,1))
slp = np.reshape(np.array([101000.0]),(-1,1))
algo = 'coare3p6'
ql, qh, taux, tauy, evap = noskin_np(sst=sst, t_zt=t_zt, hum_zt=hum_zt, u_zu=u_zu, v_zu=v_zu, slp=slp, algo=algo, zt=zt, zu=zu,
niter=10, input_range_check=True)
Or ideally, the code below would run with the xarray version:
import numpy as np
import pandas as pd
import datetime as dt
from datetime import datetime
import xarray as xr
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.dates as mdates
import cartopy.crs as ccrs
import random
import decimal
import aerobulk
from aerobulk.flux import noskin_np, skin_np, noskin, skin
# Valid data range according to documentation
# https://github.com/xgcm/aerobulk-python/blob/main/source/aerobulk/flux.py
VALID_VALUE_RANGES = {'sst': [270, 320],
't_zt': [180, 330],
'hum_zt': [0, 0.08],
'u_zu': [-50, 50],
'v_zu': [-50, 50],
'slp': [80000, 110000],
'rad_sw': [0, 1500],
'rad_lw': [0, 750]
}
sst = []
t_zt = []
hum_zt = []
u_zu = []
v_zu = []
slp = []
zt = []
zu = []
data_points = 10000
for data_point in range(0, data_points):
    sst.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['sst'][0]*1000, VALID_VALUE_RANGES['sst'][1]*1000))/1000)) # multiply by 1000 and divide back to get 0.001 precision
    t_zt.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['t_zt'][0]*1000, VALID_VALUE_RANGES['t_zt'][1]*1000))/1000))
    #t_zt.append(sst[data_point]+10) # instead of 10, use a random range of difference between 270-180 and 320-330
    hum_zt.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['hum_zt'][0]*1000000, VALID_VALUE_RANGES['hum_zt'][1]*1000000))/1000000))
    u_zu.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['u_zu'][0]*1000, VALID_VALUE_RANGES['u_zu'][1]*1000))/1000))
    v_zu.append(float(0))
    slp.append(101000.0)
    zt.append(float(decimal.Decimal(random.randrange(12*10, 19*10)/10))) # use 12 to 19, mirroring the other dataset
    zu.append(float(decimal.Decimal(random.randrange(15*10, 22*10)/10))) # use 15 to 22, mirroring the other dataset
algo = 'coare3p6'
ql, qh, taux, tauy, evap = noskin(sst=sst, t_zt=t_zt, hum_zt=hum_zt, u_zu=u_zu, v_zu=v_zu, slp=slp, algo=algo, zt=zt, zu=zu,
niter=10, input_range_check=True)
This seems to be missing the import statements needed to reproduce it; could you add them?
I will have to jump off for a bit, but here are some suggestions for how to proceed:
The error you are getting suggests that the issue lies in these lines
So here is how I would proceed:
- Include the imports necessary to fully reproduce the code above
- Reduce the size of the example above. Something like 3 values should be sufficient to generalize the problem. You could do the same with your original data (that way we can actually print the values without producing overwhelming output)
- Then for each dataset reproduce the ocean_index as ocean_index = np.where(~np.isnan(sst)) and print the output (a minimal sketch of this check follows below). Something in the creation there gets screwed up, and this might give us a hint.
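For concreteness, a minimal sketch of that check (the three-value sst array here is purely illustrative; only the ocean_index line itself comes from the suggestion above):
import numpy as np
sst = np.array([290.0, 295.5, np.nan]) # three example values, one masked as NaN
ocean_index = np.where(~np.isnan(sst)) # indices of the non-NaN (ocean) points
print(ocean_index) # e.g. (array([0, 1]),)
print(sst[ocean_index]) # the values that would actually be passed on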
Will check in later with this.
Inputs:
import numpy as np
import pandas as pd
import datetime as dt
from datetime import datetime
import xarray as xr
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.dates as mdates
import cartopy.crs as ccrs
import random
import decimal
import aerobulk
from aerobulk.flux import noskin_np, skin_np, noskin, skin
Could there be something telling in the way that data is created in the aerobulk-python test py file, here: (https://github.com/xgcm/aerobulk-python/blob/f1affeb275ae86ea21f3b5a59d83f9238d380cce/tests/create_test_data.py)?
Yes, comparing these and figuring out what the differences are would be helpful for sure.
When I run it like this, the ocean_index / args_shrunk step works for both the real and the synthetic data, but trying to call aerobulk with the synthetic data still crashes.
Created this to outline the current state, where both the numpy and xarray versions run on the real data (ds_clean) but neither runs on the synthetic data, though notably, again, one timestep of the numpy version IS working. Attempting the same for loop that works on the real data crashes on the synthetic data. I'm trying to think of what issues could be present in the data structure that would disappear if we just take a single point.
real-synthetic_np-xr_comparisons (reduced).pdf
Ok, so the issue with the xarray wrapper was that the inputs were lists (not numpy arrays or, as required here, xarray.DataArrays).
This works for me with the newest version (v0.4.0) of aerobulk-python:
import numpy as np
import pandas as pd
import datetime as dt
from datetime import datetime
import xarray as xr
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.dates as mdates
import cartopy.crs as ccrs
import random
import decimal
import aerobulk
from aerobulk.flux import noskin_np, skin_np, noskin, skin
# Valid data range according to documentation
# https://github.com/xgcm/aerobulk-python/blob/main/source/aerobulk/flux.py
VALID_VALUE_RANGES = {'sst': [270, 320],
't_zt': [180, 330],
'hum_zt': [0, 0.08],
'u_zu': [-50, 50],
'v_zu': [-50, 50],
'slp': [80000, 110000],
'rad_sw': [0, 1500],
'rad_lw': [0, 750]
}
sst = []
t_zt = []
hum_zt = []
u_zu = []
v_zu = []
slp = []
zt = []
zu = []
data_points = 3
for data_point in range(0, data_points):
    sst.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['sst'][0]*1000, VALID_VALUE_RANGES['sst'][1]*1000))/1000)) # multiply by 1000 and divide back to get 0.001 precision
    t_zt.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['t_zt'][0]*1000, VALID_VALUE_RANGES['t_zt'][1]*1000))/1000))
    #t_zt.append(sst[data_point]+10) # instead of 10, use a random range of difference between 270-180 and 320-330
    hum_zt.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['hum_zt'][0]*1000000, VALID_VALUE_RANGES['hum_zt'][1]*1000000))/1000000))
    u_zu.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['u_zu'][0]*1000, VALID_VALUE_RANGES['u_zu'][1]*1000))/1000))
    v_zu.append(float(0))
    slp.append(101000.0)
    zt.append(float(decimal.Decimal(random.randrange(12*10, 19*10)/10))) # use 12 to 19, mirroring the other dataset
    zu.append(float(decimal.Decimal(random.randrange(15*10, 22*10)/10))) # use 15 to 22, mirroring the other dataset
def wrap_da(input: np.ndarray) -> xr.DataArray:
    return xr.DataArray(input, dims={'time': np.arange(len(input))})
algo = 'coare3p6'
ql, qh, taux, tauy, evap = noskin(
    sst=wrap_da(sst),
    t_zt=wrap_da(t_zt),
    hum_zt=wrap_da(hum_zt),
    u_zu=wrap_da(u_zu),
    v_zu=wrap_da(v_zu),
    slp=wrap_da(slp),
    algo=algo,
    zt=wrap_da(zt),
    zu=wrap_da(zu),
    niter=10,
    input_range_check=True
)
I have raised #80 to clarify the input for the range checking. Also, very importantly, the results in #81 show that even if this works, it will likely produce the wrong results in your case here! We need to investigate further how we could support this out of the box, but for now I think you will have to code a workaround where you apply the wrapper to each single datapoint of your observation and then concatenate the results (a rough sketch follows below).
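For illustration, here is one possible shape such a per-datapoint workaround could take, reusing the variable names and the wrap_da helper from the example above (this is only a sketch, not a tested recipe; zt and zu are passed as plain scalars here):
results = []
for i in range(len(sst)):
    # call the xarray wrapper on a single observation at a time
    out = noskin(
        sst=wrap_da([sst[i]]),
        t_zt=wrap_da([t_zt[i]]),
        hum_zt=wrap_da([hum_zt[i]]),
        u_zu=wrap_da([u_zu[i]]),
        v_zu=wrap_da([v_zu[i]]),
        slp=wrap_da([slp[i]]),
        algo=algo,
        zt=zt[i],
        zu=zu[i],
        niter=10,
        input_range_check=True,
    )
    results.append(out)
# concatenate each of the five outputs (ql, qh, taux, tauy, evap) along 'time'
ql, qh, taux, tauy, evap = (xr.concat([r[k] for r in results], dim='time') for k in range(5))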
If I try to run this copy/pasted, I get an error!
HUGE, thank you so much for catching this!!
Oddly, if I take out the wrap_da on zu and zt, it works!!
import numpy as np
import pandas as pd
import datetime as dt
from datetime import datetime
import xarray as xr
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.dates as mdates
import cartopy.crs as ccrs
import random
import decimal
import aerobulk
from aerobulk.flux import noskin_np, skin_np, noskin, skin
# Valid data range according to documentation
# https://github.com/xgcm/aerobulk-python/blob/main/source/aerobulk/flux.py
VALID_VALUE_RANGES = {'sst': [270, 320],
't_zt': [180, 330],
'hum_zt': [0, 0.08],
'u_zu': [-50, 50],
'v_zu': [-50, 50],
'slp': [80000, 110000],
'rad_sw': [0, 1500],
'rad_lw': [0, 750]
}
sst = []
t_zt = []
hum_zt = []
u_zu = []
v_zu = []
slp = []
zt = []
zu = []
data_points = 3
for data_point in range(0, data_points):
    sst.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['sst'][0]*1000, VALID_VALUE_RANGES['sst'][1]*1000))/1000)) # multiply by 1000 and divide back to get 0.001 precision
    t_zt.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['t_zt'][0]*1000, VALID_VALUE_RANGES['t_zt'][1]*1000))/1000))
    #t_zt.append(sst[data_point]+10) # instead of 10, use a random range of difference between 270-180 and 320-330
    hum_zt.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['hum_zt'][0]*1000000, VALID_VALUE_RANGES['hum_zt'][1]*1000000))/1000000))
    u_zu.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['u_zu'][0]*1000, VALID_VALUE_RANGES['u_zu'][1]*1000))/1000))
    v_zu.append(float(0))
    slp.append(101000.0)
    zt.append(float(decimal.Decimal(random.randrange(12*10, 19*10)/10))) # use 12 to 19, mirroring the other dataset
    zu.append(float(decimal.Decimal(random.randrange(15*10, 22*10)/10))) # use 15 to 22, mirroring the other dataset
def wrap_da(input: np.ndarray) -> xr.DataArray:
    return xr.DataArray(input, dims={'time': np.arange(len(input))})
algo = 'coare3p6'
ql, qh, taux, tauy, evap = noskin(
    sst=wrap_da(sst),
    t_zt=wrap_da(t_zt),
    hum_zt=wrap_da(hum_zt),
    u_zu=wrap_da(u_zu),
    v_zu=wrap_da(v_zu),
    slp=wrap_da(slp),
    algo=algo,
    zt=zt,
    zu=zu,
    niter=10,
    input_range_check=True
)
I suspect that either the wrapper or the Fortran code will just pick the first value of any iterable (including a list as given here); since these inputs are not manipulated on the xarray/numpy level like the other inputs, this makes sense. I think this will still give you 2 identical output values like in #81 though, right? One hypothetical way to check this is sketched below.
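If it helps, a rough way to test that suspicion with the example above would be to compare a run that passes the full zt/zu lists against one that passes only their first values (everything here just reuses the variables and wrap_da helper defined earlier):
out_list = noskin(sst=wrap_da(sst), t_zt=wrap_da(t_zt), hum_zt=wrap_da(hum_zt),
                  u_zu=wrap_da(u_zu), v_zu=wrap_da(v_zu), slp=wrap_da(slp),
                  algo=algo, zt=zt, zu=zu, niter=10, input_range_check=True)
out_first = noskin(sst=wrap_da(sst), t_zt=wrap_da(t_zt), hum_zt=wrap_da(hum_zt),
                   u_zu=wrap_da(u_zu), v_zu=wrap_da(v_zu), slp=wrap_da(slp),
                   algo=algo, zt=zt[0], zu=zu[0], niter=10, input_range_check=True)
# if the zt/zu lists are effectively truncated to their first element,
# every output should compare equal between the two runs
for a, b in zip(out_list, out_first):
    print(bool((a == b).all()))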