Hierarchical MultiTags
plodocus opened this issue · 4 comments
Hi,
I'm currently playing around with nix, so apologies if I missed an obvious solution to my question.
I have two types of continuous data that I would like to be able to reference using one MultiTag.
Concretely, the data are from one session of a trial-sectioned behavioural experiment:
- Continuous electrophysiological data.
- Snippets of continuous movement data. These are only recorded during the trial proper, i.e. there are discontinuities between subsequent trials (thus the concatenated movement data for a session is much shorter than the ephys data).
I'd like to be able to retrieve the ephys and movement data on a trial-basis.
Right now, I have two approaches:
- Create a MultiTag for each kind of data. This is is inconvenient, since I'd have to use two MultiTags.
- Pad/impute the movement data so that it's the same length as the ephys data and use one MultiTag. That seems like an inefficient way to store the data because the inter-trial intervals are often longer than trials themselves.
So would it be possible to use approach 1 but have a "parent" MultiTag or other grouping possibility?
Basically, if "child" MultiTags have the same shape (same extents, same number of positions but offset) retrieve_data
on the "parent" MultiTag would give me the associated data from both kinds of data.
Hi @DanBenHa, our dimension descriptors do not know gaps. Thus we need to take a slightly different approach which nevertheless should be more efficient than padding. But this depends of the pauses. The following code shows how I would probably do it. (This does not mean that there are no other ways ;) I hope I understood your setting correctly.
The "trick" I would suggest is to use a RangeDimension for the movement data and store the timestamps for each data point as "ticks" of the dimension descriptor. I hope this helps.
import nixio as nix
import numpy as np
import matplotlib.pyplot as plt
def create_data():
# create some dummy data
trial_starts = np.arange(0.0, 10.0, 2.5) # s
trial_duration = 2.0 # s shorter than the inter trial interval, i.e. there are pauses
dt = 0.01 # stepsize in s
theta = 6 # Hz
time = np.arange(0.0, trial_starts[-1] + trial_duration + 0.1, dt)
ephys_data = np.random.randn(len(time)) + np.sin(2 * np.pi * theta * time) # is recorded continuously without gaps
movement_data = np.zeros(len(trial_starts) * int(trial_duration / dt)) # shorter since movement only recorded during trials
movement_timestamps = np.zeros_like(movement_data)
for i, trial_start in enumerate(trial_starts):
start_index = int(i * trial_duration / dt)
end_index = int(start_index + trial_duration / dt)
movement_data[start_index:end_index] = i + 2
movement_timestamps[start_index:end_index] = time[(time >= trial_start) & (time < trial_start + trial_duration)]
# store the data
nf = nix.File.open("EphysAndBehaviour.nix", nix.FileMode.Overwrite)
b = nf.create_block("session 1", "nix.recording_session")
ephys_array = b.create_data_array("field potential", "nix.sampled", data=ephys_data, dtype=nix.DataType.Float)
ephys_array.label = "voltage"
ephys_array.unit = "uV"
dim = ephys_array.append_sampled_dimension(dt)
dim.label = "time"
dim.unit = "s"
movement_array = b.create_data_array("movement speed", "nix.irregular_sampled", data=movement_data, dtype=nix.DataType.Float)
movement_array.label = "movement speed"
movement_array.unit = "m/s"
dim = movement_array.append_range_dimension(movement_timestamps)
dim.label = "time"
dim.unit = "s"
start_array = b.create_data_array("trial start times", "nix.events", data=trial_starts)
start_array.append_set_dimension()
extent_array = b.create_data_array("trial extents", "nix.extents", data=np.ones_like(trial_starts) * trial_duration)
extent_array.append_set_dimension()
mtag = b.create_multi_tag("Trials", "nix.segments.trials", start_array)
mtag.extents = extent_array
mtag.references.append(movement_array)
mtag.references.append(ephys_array)
nf.close()
def retrieve_data():
nf = nix.File.open("EphysAndBehaviour.nix", nix.FileMode.ReadOnly)
b = nf.blocks[0]
mtag = b.multi_tags[0]
mv_array = mtag.references["movement speed"]
mv_timestamps = np.array(mv_array.dimensions[0].ticks)
fp_array = mtag.references["field potential"]
fp_data = fp_array[:]
time = np.array(fp_array.dimensions[0].axis(len(fp_data)))
plt.plot(time, fp_data, label="full recording")
starts = mtag.positions[:]
extents = mtag.extents[:]
for i, (start, extent) in enumerate(zip(starts, extents)):
fp_snippet = mtag.retrieve_data(i, "field potential")[:-1] # 1.4 versions include the last element
fp_time = time[(time >= start) & (time < start + extent)]
l = plt.plot(fp_time, fp_snippet)
mv_snippet = mtag.retrieve_data(i, "movement speed")[:]
mv_time = mv_timestamps[(mv_timestamps >= start) & (mv_timestamps < start + extent)]
plt.plot(mv_time, mv_snippet, label="trial: %i" %i, lw=2.0, color=l[0].get_color())
plt.legend()
plt.show()
nf.close()
if __name__ == "__main__":
create_data()
retrieve_data()
I was preparing a similar solution but Jan's is a great demonstration so I wont post it.
Instead, I'll explain a bit about what's going on with the solution.
It is possible to achieve what you want with a single MultiTag if the time dimensions of the two data arrays are properly aligned. The continuous ephys data array should have a dimension descriptor for the time axis which is a SampledDimension (with the appropriate sampling interval). The snippets can be concatenated to a single data array, but their time dimension would then need to be a RangeDimension which contains every time stamp of the data (ticks). So each sample in the data should have a corresponding time stamp in the dimension descriptor.
Then once you create the MultiTag and reference both data arrays, the positions and extents defined will correspond to the time point in each data array. The index this corresponds to in each array depends on the time dimension descriptor. The data retrieval method of the MultiTag uses the information in the dimension of each referenced data array to determine the start and end indexes for the corresponding data.
Thanks to both of you! I was under the impression that MultiTag positions and extents were in the unit of elements/samples but @achilleas-k's explanation clarified that.
Since I have my solution and I don't want it to get lost in my archives, I'll post it here anyway in case it helps at all in the future.
import nixio as nix
import numpy as np
import matplotlib.pyplot as plt
# Create a fake continuous signal
cont_time = np.arange(0, 1.5, 0.001)
cont_data = np.sin(cont_time * 2 * np.pi * 3)
# Create two fake shorter signals that fit within the first signal's time
seg1_time = np.arange(0, 0.3, 0.001)
seg1_data = np.sin(seg1_time * 2 * np.pi * 7) * 0.3
seg2_time = np.arange(0.7, 1, 0.001)
seg2_data = np.sin(seg2_time * 2 * np.pi * 7) * 0.3
# Plot to see what they look like
plt.figure("Original data")
plt.plot(cont_time, cont_data, color="red")
plt.plot(seg1_time, seg1_data, color="black")
plt.plot(seg2_time, seg2_data, color="black")
nf = nix.File.open("example.nix", nix.FileMode.Overwrite)
blk = nf.create_block("example", "example")
# Continuous signal is regularly sampled, so we can define the time axis using just the
# sampling_interval. Let's assume it's in seconds.
cont_da = blk.create_data_array("cont", "continuous data", data=cont_data)
cont_time_dim = cont_da.append_sampled_dimension(sampling_interval=0.001)
cont_time_dim.unit = "s"
cont_time_dim.label = "time"
# Concatenate the two snippets and their time axes to create a single data array with all the data.
seg_data = np.append(seg1_data, seg2_data)
seg_time = np.append(seg1_time, seg2_time)
seg_da = blk.create_data_array("experiments", "segmented data", data=seg_data)
seg_time_dim = seg_da.append_range_dimension(ticks=seg_time)
seg_time_dim.unit = "s"
seg_time_dim.label = "time"
# Create a MultiTag that points to the two snippet times but references both data arrays.
exp_starts = blk.create_data_array("exp start times", "mtag.positions", data=[0, 0.7])
exp_durations = blk.create_data_array("exp durations", "mtag.extents", data=[0.3, 0.3])
exp_tag = blk.create_multi_tag("exp segments", "experiments tag", positions=exp_starts)
exp_tag.extents = exp_durations
# this is important for aligning the tag's positions and extents to the dimensions of the data
# arrays
exp_tag.units = ["s"]
exp_tag.references.append(cont_da)
exp_tag.references.append(seg_da)
# Retrieve the data for the first experiment's duration from each data array
exp1_cont_data = exp_tag.retrieve_data(0, 0)
exp1_seg_data = exp_tag.retrieve_data(0, 1)
# Retrieve the time axes for each.
cont_samples = len(exp1_cont_data)
# We can reuse the dimension above, but let's assume we don't have the reference anymore.
cont_time_dim = exp_tag.references[0].dimensions[0]
# start time index of the first experiment on the continuous data time index
exp1_cont_start = cont_time_dim.index_of(exp_tag.positions[0])
exp1_cont_time = cont_time_dim.axis(count=cont_samples, start=exp1_cont_start)
seg_samples = len(exp1_seg_data)
seg_time_dim = exp_tag.references[1].dimensions[0]
exp1_seg_start = seg_time_dim.index_of(exp_tag.positions[0])
exp1_seg_time = seg_time_dim.axis(count=seg_samples, start=exp1_seg_start)
plt.figure("Tagged data")
plt.plot(exp1_cont_time, exp1_cont_data)
plt.plot(exp1_seg_time, exp1_seg_data)
# Do the same for the second experiment
exp2_cont_data = exp_tag.retrieve_data(1, 0)[:-1]
exp2_seg_data = exp_tag.retrieve_data(1, 1)
cont_samples = len(exp2_cont_data)
cont_time_dim = exp_tag.references[0].dimensions[0]
exp2_cont_start = cont_time_dim.index_of(exp_tag.positions[1])
exp2_cont_time = cont_time_dim.axis(count=cont_samples, start=exp2_cont_start)
seg_samples = len(exp2_seg_data)
seg_time_dim = exp_tag.references[1].dimensions[0]
exp2_seg_start = cont_time_dim.index_of(exp_tag.positions[1])
exp2_seg_time = cont_time_dim.axis(count=cont_samples, start=exp2_seg_start)
plt.plot(exp2_cont_time, exp2_cont_data)
plt.plot(exp2_seg_time, exp2_seg_data)
plt.show()
The figures at the end should look like this: