MLBazaar/MLPrimitives

Improve find_anomalies primitive

AlexanderGeiger opened this issue · 1 comments

Description

Update the find_anomalies primitive according to the issues mentioned in Orion:

Suggestion

Introduce new hyperparameters for the find_anomalies primitive like the following:

"hyperparameters": {
    "fixed": {
        "z_range": {
            "type": "tuple",
            "default": [
                0,
                12
            ]
        },
        "window_size": {
            "type": "int",
            "default": 2000
        },
        "window_step_size": {
            "type": "int",
            "default": 200
        },
        "lower_threshold": {
            "type": "bool",
            "default": false
        }
    },
    "tunable": {
        "min_percent": {
            "type": "float",
            "default": 0.13,
            "range": [
                0.01,
                0.9
            ]
        },
        "anomaly_interval": {
            "type": "int",
            "default": 50,
            "range": [
                0,
                400
            ]
        }
    }
}

Change the find_anomalies function, such that it supports the lower threshold and a flexible step_size for overlapping thresholds.

def find_anomalies(errors, index, z_range=(0, 10), window_size=None, window_step_size=None, min_percent=0.1,
                   anomaly_interval=50, lower_threshold=False):
    """Find sequences of values that are anomalous.
    We first find the ideal threshold for the set of errors that we have,
    and then find the sequences of values that are above this threshold.
    Lastly, we compute a score proportional to the maximum error in the
    sequence, and finally return the index pairs that correspond to
    each sequence, along with its score.
    """

    window_size = window_size or len(errors)
    window_step_size = window_step_size or len(errors)
    window_start = 0
    window_end = 0
    sequences = list()

    while window_end < len(errors):
        window_end = window_start + window_size
        window = errors[window_start:window_end]

        threshold = _find_threshold(window, z_range)
        window_sequences, max_below = _find_sequences(window, threshold, anomaly_interval)
        max_errors = _get_max_errors(window, window_sequences, max_below)
        pruned_anomalies = _prune_anomalies(max_errors, min_percent)
        window_sequences = _compute_scores(pruned_anomalies, errors, threshold, window_start)
        sequences.extend(window_sequences)

        if lower_threshold:
            # Flip errors sequence around mean
            mean = window.mean()
            inverted_window = mean - (window - mean)

            # Perform same procedure as above
            threshold = _find_threshold(inverted_window, z_range)
            window_sequences, max_below = _find_sequences(inverted_window, threshold, anomaly_interval)
            max_errors = _get_max_errors(inverted_window, window_sequences, max_below)
            pruned_anomalies = _prune_anomalies(max_errors, min_percent)
            window_sequences = _compute_scores(pruned_anomalies, errors, threshold, window_start)
            sequences.extend(window_sequences)

        window_start = window_start + window_step_size

    sequences = _merge_sequences(sequences)

    anomalies = list()
    for start, stop, score in sequences:
        anomalies.append([index[int(start)], index[int(stop)], score])

    return np.asarray(anomalies)

The _find_sequences function can be extended to add the area around anomalies to the sequences as well:

def _find_sequences(errors, epsilon, anomaly_interval):
    """Find sequences of values that are above epsilon.

    This is done following this steps:

        * create a boolean mask that indicates which values are above epsilon.
        * shift this mask by one place, filing the empty gap with a False
        * compare the shifted mask with the original one to see if there are changes.
        * Consider a sequence start any point which was true and has changed
        * Consider a sequence end any point which was false and has changed
    """
    above = pd.Series(errors > epsilon)

    # mark area around anomalies also as True
    index_above = np.argwhere(above)
    for i in index_above.flatten():
        above[max(0, i-anomaly_interval):min(i+anomaly_interval+1, len(above))] = True
    shift = above.shift(1).fillna(False)
    change = above != shift

    if above.all():
        max_below = 0
    else:
        max_below = max(errors[~above])

    index = above.index
    starts = index[above & change].tolist()
    ends = (index[~above & change] - 1).tolist()
    if len(ends) == len(starts) - 1:
        ends.append(len(above) - 1)

    return np.array([starts, ends]).T, max_below

Then we can calculate the score of a sequence with this function:

def _compute_scores(pruned_anomalies, errors, threshold, window_start):
    """Compute the score of the anomalies and add window_start timestamp to make the index absolute.
    """
    anomalies = list()
    denominator = errors.mean() + errors.std()

    for row in pruned_anomalies:
        max_error = row[2]
        score = (max_error - threshold) / denominator
        anomalies.append([row[0]+window_start, row[1]+window_start, score])

    return anomalies

The _merge_consecutive function can be changed like this to merge overlapping or consecutive sequences and calculate the anomaly score:

def _merge_sequences(sequences):
    """Merge consecutive and overlapping sequences.
    We iterate over a list of start, end, score triples and merge together
    overlapping or consecutive sequences.
    The score of a merged sequence is the average of the single scores,
    weighted by the length of the corresponding sequences.
    """

    sorted_sequences = sorted(sequences, key=lambda entry: entry[0])
    new_sequences = []
    score = list()
    weights = list()

    for i in sorted_sequences:
        if not new_sequences:
            score.append(i[2])
            weights.append(i[1] - i[0])
            new_sequences.append(i)
        else:
            j = new_sequences[-1]
            if i[0] <= j[1]+1:
                score.append(i[2])
                weights.append(i[1]-i[0])
                weighted_average = np.average(score, weights=weights)
                new_sequences[-1] = (j[0], max(j[1], i[1]), weighted_average)
            else:
                score = [i[2]]
                weights = [i[1]-i[0]]
                new_sequences.append(i)

    return np.array(new_sequences)
csala commented

Closed via #181