Running aerobulk on synthetic data
The notebook shows that aerobulk and its Python wrapper are properly installed and running on a locally saved dataset. However, when I try to create "synthetic" data and run it through the algorithm, it crashes, either killing my kernel or producing the various errors shown in the notebook. If I use the numpy version on a single timestep it works, but a for loop over all times does not. This, in combination with the main error "too many indices for array: array is 0-dimensional, but 1 were indexed", makes me think the problem is in the way I created the data / the structure itself.
Any help is very much appreciated!
minimally_reproducible_example.pdf
# Valid data range according to documentation
# https://github.com/xgcm/aerobulk-python/blob/main/source/aerobulk/flux.py
VALID_VALUE_RANGES = {'sst': [270, 320],
't_zt': [180, 330],
'hum_zt': [0, 0.08],
'u_zu': [-50, 50],
'v_zu': [-50, 50],
'slp': [80000, 110000],
'rad_sw': [0, 1500],
'rad_lw': [0, 750]
}
sst = []
t_zt = []
hum_zt = []
u_zu = []
v_zu = []
slp = []
zt = []
zu = []
data_points = 10000
#INITIAL: don't specify any kind of distribution. Later, look at choices/ways in random to specify
#Later, also experiment with specifying a random difference, random sst, and adding for air temp
for data_point in range(0, data_points):
    sst.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['sst'][0]*1000, VALID_VALUE_RANGES['sst'][1]*1000))/1000)) # multiply by 1000 and divide back to get 0.001 precision
    t_zt.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['t_zt'][0]*1000, VALID_VALUE_RANGES['t_zt'][1]*1000))/1000))
    #t_zt.append(sst[data_point]+10) # instead of 10, use a random range of difference between 270-180 and 320-330
    hum_zt.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['hum_zt'][0]*1000000, VALID_VALUE_RANGES['hum_zt'][1]*1000000))/1000000))
    u_zu.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['u_zu'][0]*1000, VALID_VALUE_RANGES['u_zu'][1]*1000))/1000))
    v_zu.append(float(0))
    slp.append(101000.0)
    zt.append(float(decimal.Decimal(random.randrange(12*10, 19*10)/10))) # use 12 to 19, mirroring the other dataset
    zu.append(float(decimal.Decimal(random.randrange(15*10, 22*10)/10))) # use 15 to 22, mirroring the other dataset
# Cast the lists (and the scalar slp) into 2-D numpy arrays of shape (n, 1)
sst = np.reshape(sst,(-1,1))
t_zt = np.reshape(t_zt,(-1,1))
hum_zt = np.reshape(hum_zt,(-1,1))
u_zu = np.reshape(u_zu,(-1,1))
v_zu = np.reshape(v_zu,(-1,1))
zt = np.reshape(zt,(-1,1))
zu = np.reshape(zu,(-1,1))
slp = np.reshape(np.array([101000.0]),(-1,1))
algo = 'coare3p6'
ql, qh, taux, tauy, evap = noskin_np(sst=sst, t_zt=t_zt, hum_zt=hum_zt, u_zu=u_zu, v_zu=v_zu, slp=slp, algo=algo, zt=zt, zu=zu,
niter=10, input_range_check=True)
Or ideally, the code below would run with the xarray version:
import numpy as np
import pandas as pd
import datetime as dt
from datetime import datetime
import xarray as xr
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.dates as mdates
import cartopy.crs as ccrs
import random
import decimal
import aerobulk
from aerobulk.flux import noskin_np, skin_np, noskin, skin
# Valid data range according to documentation
# https://github.com/xgcm/aerobulk-python/blob/main/source/aerobulk/flux.py
VALID_VALUE_RANGES = {'sst': [270, 320],
't_zt': [180, 330],
'hum_zt': [0, 0.08],
'u_zu': [-50, 50],
'v_zu': [-50, 50],
'slp': [80000, 110000],
'rad_sw': [0, 1500],
'rad_lw': [0, 750]
}
sst = []
t_zt = []
hum_zt = []
u_zu = []
v_zu = []
slp = []
zt = []
zu = []
data_points = 10000
for data_point in range(0, data_points):
    sst.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['sst'][0]*1000, VALID_VALUE_RANGES['sst'][1]*1000))/1000)) # multiply by 1000 and divide back to get 0.001 precision
    t_zt.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['t_zt'][0]*1000, VALID_VALUE_RANGES['t_zt'][1]*1000))/1000))
    #t_zt.append(sst[data_point]+10) # instead of 10, use a random range of difference between 270-180 and 320-330
    hum_zt.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['hum_zt'][0]*1000000, VALID_VALUE_RANGES['hum_zt'][1]*1000000))/1000000))
    u_zu.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['u_zu'][0]*1000, VALID_VALUE_RANGES['u_zu'][1]*1000))/1000))
    v_zu.append(float(0))
    slp.append(101000.0)
    zt.append(float(decimal.Decimal(random.randrange(12*10, 19*10)/10))) # use 12 to 19, mirroring the other dataset
    zu.append(float(decimal.Decimal(random.randrange(15*10, 22*10)/10))) # use 15 to 22, mirroring the other dataset
algo = 'coare3p6'
ql, qh, taux, tauy, evap = noskin(sst=sst, t_zt=t_zt, hum_zt=hum_zt, u_zu=u_zu, v_zu=v_zu, slp=slp, algo=algo, zt=zt, zu=zu,
niter=10, input_range_check=True)
This seems to be missing the import statements needed to reproduce it; could you add them?
I will have to jump off for a bit, but here are some suggestions for how to proceed:
The error you are getting suggests that the issue lies in these lines
So here is how I would proceed:
- Include the imports necessary to fully reproduce the code above
- Reduce the size of the example above. Something like 3 values should be sufficient to generalize the problem. You could do the same with your original data (that way we can actually print the values without producing overwhelming output)
- Then for each dataset reproduce the ocean_index as ocean_index = np.where(~np.isnan(sst)) and print the output (a minimal sketch of this check follows below). Something in the creation there gets screwed up, and this might give us a hint.
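For concreteness, a minimal sketch of that check (the three-value sst array here is purely illustrative; only the ocean_index line itself comes from the suggestion above):
import numpy as np
sst = np.array([290.0, 295.5, np.nan]) # three example values, one masked as NaN
ocean_index = np.where(~np.isnan(sst)) # indices of the non-NaN (ocean) points
print(ocean_index) # e.g. (array([0, 1]),)
print(sst[ocean_index]) # the values that would actually be passed on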
Will check in later with this.
Inputs:
import numpy as np
import pandas as pd
import datetime as dt
from datetime import datetime
import xarray as xr
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.dates as mdates
import cartopy.crs as ccrs
import random
import decimal
import aerobulk
from aerobulk.flux import noskin_np, skin_np, noskin, skin
Could there be something telling in the way that data is created in the aerobulk-python test py file, here: (https://github.com/xgcm/aerobulk-python/blob/f1affeb275ae86ea21f3b5a59d83f9238d380cce/tests/create_test_data.py)?
Yes, comparing these and figuring out what the differences are would be helpful for sure.
When I run it like this, the ocean_index / args_shrunk step works for both the real and the synthetic data, but trying to call aerobulk with the synthetic data still crashes.
Created this to outline the current state, where both the numpy and xarray versions run on the real data (ds_clean) but neither runs on the synthetic data, though notably, again, one timestep of the numpy version IS working. Attempting the same for loop that works on the real data crashes on the synthetic data. I'm trying to think of what issues could be present in the data structure that would disappear if we just take a single point.
real-synthetic_np-xr_comparisons (reduced).pdf
Ok, so the issue with the xarray wrapper was that the inputs were lists (not numpy arrays or, as required here, xarray.DataArrays).
This works for me with the newest version (v0.4.0) of aerobulk-python:
import numpy as np
import pandas as pd
import datetime as dt
from datetime import datetime
import xarray as xr
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.dates as mdates
import cartopy.crs as ccrs
import random
import decimal
import aerobulk
from aerobulk.flux import noskin_np, skin_np, noskin, skin
# Valid data range according to documentation
# https://github.com/xgcm/aerobulk-python/blob/main/source/aerobulk/flux.py
VALID_VALUE_RANGES = {'sst': [270, 320],
't_zt': [180, 330],
'hum_zt': [0, 0.08],
'u_zu': [-50, 50],
'v_zu': [-50, 50],
'slp': [80000, 110000],
'rad_sw': [0, 1500],
'rad_lw': [0, 750]
}
sst = []
t_zt = []
hum_zt = []
u_zu = []
v_zu = []
slp = []
zt = []
zu = []
data_points = 3
for data_point in range(0, data_points):
    sst.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['sst'][0]*1000, VALID_VALUE_RANGES['sst'][1]*1000))/1000)) # multiply by 1000 and divide back to get 0.001 precision
    t_zt.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['t_zt'][0]*1000, VALID_VALUE_RANGES['t_zt'][1]*1000))/1000))
    #t_zt.append(sst[data_point]+10) # instead of 10, use a random range of difference between 270-180 and 320-330
    hum_zt.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['hum_zt'][0]*1000000, VALID_VALUE_RANGES['hum_zt'][1]*1000000))/1000000))
    u_zu.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['u_zu'][0]*1000, VALID_VALUE_RANGES['u_zu'][1]*1000))/1000))
    v_zu.append(float(0))
    slp.append(101000.0)
    zt.append(float(decimal.Decimal(random.randrange(12*10, 19*10)/10))) # use 12 to 19, mirroring the other dataset
    zu.append(float(decimal.Decimal(random.randrange(15*10, 22*10)/10))) # use 15 to 22, mirroring the other dataset
def wrap_da(input: np.ndarray) -> xr.DataArray:
    return xr.DataArray(input, dims={'time': np.arange(len(input))})
algo = 'coare3p6'
ql, qh, taux, tauy, evap = noskin(
    sst=wrap_da(sst),
    t_zt=wrap_da(t_zt),
    hum_zt=wrap_da(hum_zt),
    u_zu=wrap_da(u_zu),
    v_zu=wrap_da(v_zu),
    slp=wrap_da(slp),
    algo=algo,
    zt=wrap_da(zt),
    zu=wrap_da(zu),
    niter=10,
    input_range_check=True
)
I have raised #80 to clarify the input for the range checking. Also, very importantly, the results in #81 show that even if this works, it will likely produce the wrong results in your case here! We need to investigate further how we could support this out of the box, but for now I think you will have to code a workaround where you apply the wrapper to each single datapoint of your observation and then concatenate the results (a rough sketch follows below).
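For illustration, here is one possible shape such a per-datapoint workaround could take, reusing the variable names and the wrap_da helper from the example above (this is only a sketch, not a tested recipe; zt and zu are passed as plain scalars here):
results = []
for i in range(len(sst)):
    # call the xarray wrapper on a single observation at a time
    out = noskin(
        sst=wrap_da([sst[i]]),
        t_zt=wrap_da([t_zt[i]]),
        hum_zt=wrap_da([hum_zt[i]]),
        u_zu=wrap_da([u_zu[i]]),
        v_zu=wrap_da([v_zu[i]]),
        slp=wrap_da([slp[i]]),
        algo=algo,
        zt=zt[i],
        zu=zu[i],
        niter=10,
        input_range_check=True,
    )
    results.append(out)
# concatenate each of the five outputs (ql, qh, taux, tauy, evap) along 'time'
ql, qh, taux, tauy, evap = (xr.concat([r[k] for r in results], dim='time') for k in range(5))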
If I try to run this copy/pasted, I get an error!
HUGE, thank you so much for catching this!!
Oddly, if I take out the wrap_da on zu and zt, it works!!
import numpy as np
import pandas as pd
import datetime as dt
from datetime import datetime
import xarray as xr
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.dates as mdates
import cartopy.crs as ccrs
import random
import decimal
import aerobulk
from aerobulk.flux import noskin_np, skin_np, noskin, skin
# Valid data range according to documentation
# https://github.com/xgcm/aerobulk-python/blob/main/source/aerobulk/flux.py
VALID_VALUE_RANGES = {'sst': [270, 320],
't_zt': [180, 330],
'hum_zt': [0, 0.08],
'u_zu': [-50, 50],
'v_zu': [-50, 50],
'slp': [80000, 110000],
'rad_sw': [0, 1500],
'rad_lw': [0, 750]
}
sst = []
t_zt = []
hum_zt = []
u_zu = []
v_zu = []
slp = []
zt = []
zu = []
data_points = 3
for data_point in range(0, data_points):
    sst.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['sst'][0]*1000, VALID_VALUE_RANGES['sst'][1]*1000))/1000)) # multiply by 1000 and divide back to get 0.001 precision
    t_zt.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['t_zt'][0]*1000, VALID_VALUE_RANGES['t_zt'][1]*1000))/1000))
    #t_zt.append(sst[data_point]+10) # instead of 10, use a random range of difference between 270-180 and 320-330
    hum_zt.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['hum_zt'][0]*1000000, VALID_VALUE_RANGES['hum_zt'][1]*1000000))/1000000))
    u_zu.append(float(decimal.Decimal(random.randrange(VALID_VALUE_RANGES['u_zu'][0]*1000, VALID_VALUE_RANGES['u_zu'][1]*1000))/1000))
    v_zu.append(float(0))
    slp.append(101000.0)
    zt.append(float(decimal.Decimal(random.randrange(12*10, 19*10)/10))) # use 12 to 19, mirroring the other dataset
    zu.append(float(decimal.Decimal(random.randrange(15*10, 22*10)/10))) # use 15 to 22, mirroring the other dataset
def wrap_da(input: np.ndarray) -> xr.DataArray:
    return xr.DataArray(input, dims={'time': np.arange(len(input))})
algo = 'coare3p6'
ql, qh, taux, tauy, evap = noskin(
    sst=wrap_da(sst),
    t_zt=wrap_da(t_zt),
    hum_zt=wrap_da(hum_zt),
    u_zu=wrap_da(u_zu),
    v_zu=wrap_da(v_zu),
    slp=wrap_da(slp),
    algo=algo,
    zt=zt,
    zu=zu,
    niter=10,
    input_range_check=True
)
I suspect that either the wrapper or the Fortran code will just pick the first value of any iterable (including a list as given here); since these inputs are not manipulated on the xarray/numpy level like the other inputs, this makes sense. I think this will still give you 2 identical output values like in #81 though, right? One hypothetical way to check this is sketched below.
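If it helps, a rough way to test that suspicion with the example above would be to compare a run that passes the full zt/zu lists against one that passes only their first values (everything here just reuses the variables and wrap_da helper defined earlier):
out_list = noskin(sst=wrap_da(sst), t_zt=wrap_da(t_zt), hum_zt=wrap_da(hum_zt),
                  u_zu=wrap_da(u_zu), v_zu=wrap_da(v_zu), slp=wrap_da(slp),
                  algo=algo, zt=zt, zu=zu, niter=10, input_range_check=True)
out_first = noskin(sst=wrap_da(sst), t_zt=wrap_da(t_zt), hum_zt=wrap_da(hum_zt),
                   u_zu=wrap_da(u_zu), v_zu=wrap_da(v_zu), slp=wrap_da(slp),
                   algo=algo, zt=zt[0], zu=zu[0], niter=10, input_range_check=True)
# if the zt/zu lists are effectively truncated to their first element,
# every output should compare equal between the two runs
for a, b in zip(out_list, out_first):
    print(bool((a == b).all()))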