Improve find_anomalies primitive
AlexanderGeiger opened this issue · 1 comments
AlexanderGeiger commented
Description
Update the find_anomalies primitive according to the issues mentioned in Orion:
- Add overlapping threshold
- Add anomalous area around an anomaly
- Threshold for unusually low errors
- Calculate anomaly score for overlapping or consecutive anomalies
Suggestion
Introduce new hyperparameters for the find_anomalies primitive like the following:
"hyperparameters": {
"fixed": {
"z_range": {
"type": "tuple",
"default": [
0,
12
]
},
"window_size": {
"type": "int",
"default": 2000
},
"window_step_size": {
"type": "int",
"default": 200
},
"lower_threshold": {
"type": "bool",
"default": false
}
},
"tunable": {
"min_percent": {
"type": "float",
"default": 0.13,
"range": [
0.01,
0.9
]
},
"anomaly_interval": {
"type": "int",
"default": 50,
"range": [
0,
400
]
}
}
}
Change the find_anomalies function, such that it supports the lower threshold and a flexible step_size for overlapping thresholds.
def find_anomalies(errors, index, z_range=(0, 10), window_size=None, window_step_size=None, min_percent=0.1,
anomaly_interval=50, lower_threshold=False):
"""Find sequences of values that are anomalous.
We first find the ideal threshold for the set of errors that we have,
and then find the sequences of values that are above this threshold.
Lastly, we compute a score proportional to the maximum error in the
sequence, and finally return the index pairs that correspond to
each sequence, along with its score.
"""
window_size = window_size or len(errors)
window_step_size = window_step_size or len(errors)
window_start = 0
window_end = 0
sequences = list()
while window_end < len(errors):
window_end = window_start + window_size
window = errors[window_start:window_end]
threshold = _find_threshold(window, z_range)
window_sequences, max_below = _find_sequences(window, threshold, anomaly_interval)
max_errors = _get_max_errors(window, window_sequences, max_below)
pruned_anomalies = _prune_anomalies(max_errors, min_percent)
window_sequences = _compute_scores(pruned_anomalies, errors, threshold, window_start)
sequences.extend(window_sequences)
if lower_threshold:
# Flip errors sequence around mean
mean = window.mean()
inverted_window = mean - (window - mean)
# Perform same procedure as above
threshold = _find_threshold(inverted_window, z_range)
window_sequences, max_below = _find_sequences(inverted_window, threshold, anomaly_interval)
max_errors = _get_max_errors(inverted_window, window_sequences, max_below)
pruned_anomalies = _prune_anomalies(max_errors, min_percent)
window_sequences = _compute_scores(pruned_anomalies, errors, threshold, window_start)
sequences.extend(window_sequences)
window_start = window_start + window_step_size
sequences = _merge_sequences(sequences)
anomalies = list()
for start, stop, score in sequences:
anomalies.append([index[int(start)], index[int(stop)], score])
return np.asarray(anomalies)
The _find_sequences function can be extended to add the area around anomalies to the sequences as well:
def _find_sequences(errors, epsilon, anomaly_interval):
"""Find sequences of values that are above epsilon.
This is done following this steps:
* create a boolean mask that indicates which values are above epsilon.
* shift this mask by one place, filing the empty gap with a False
* compare the shifted mask with the original one to see if there are changes.
* Consider a sequence start any point which was true and has changed
* Consider a sequence end any point which was false and has changed
"""
above = pd.Series(errors > epsilon)
# mark area around anomalies also as True
index_above = np.argwhere(above)
for i in index_above.flatten():
above[max(0, i-anomaly_interval):min(i+anomaly_interval+1, len(above))] = True
shift = above.shift(1).fillna(False)
change = above != shift
if above.all():
max_below = 0
else:
max_below = max(errors[~above])
index = above.index
starts = index[above & change].tolist()
ends = (index[~above & change] - 1).tolist()
if len(ends) == len(starts) - 1:
ends.append(len(above) - 1)
return np.array([starts, ends]).T, max_below
Then we can calculate the score of a sequence with this function:
def _compute_scores(pruned_anomalies, errors, threshold, window_start):
"""Compute the score of the anomalies and add window_start timestamp to make the index absolute.
"""
anomalies = list()
denominator = errors.mean() + errors.std()
for row in pruned_anomalies:
max_error = row[2]
score = (max_error - threshold) / denominator
anomalies.append([row[0]+window_start, row[1]+window_start, score])
return anomalies
The _merge_consecutive function can be changed like this to merge overlapping or consecutive sequences and calculate the anomaly score:
def _merge_sequences(sequences):
"""Merge consecutive and overlapping sequences.
We iterate over a list of start, end, score triples and merge together
overlapping or consecutive sequences.
The score of a merged sequence is the average of the single scores,
weighted by the length of the corresponding sequences.
"""
sorted_sequences = sorted(sequences, key=lambda entry: entry[0])
new_sequences = []
score = list()
weights = list()
for i in sorted_sequences:
if not new_sequences:
score.append(i[2])
weights.append(i[1] - i[0])
new_sequences.append(i)
else:
j = new_sequences[-1]
if i[0] <= j[1]+1:
score.append(i[2])
weights.append(i[1]-i[0])
weighted_average = np.average(score, weights=weights)
new_sequences[-1] = (j[0], max(j[1], i[1]), weighted_average)
else:
score = [i[2]]
weights = [i[1]-i[0]]
new_sequences.append(i)
return np.array(new_sequences)