from enum import Enum
from typing import Optional, Union, List, Tuple, Generator
import logging
import math
import operator

import numpy as np
import pandas as pd

from analytic_types import AnalyticUnitId, ModelCache
from analytic_types.detector import DetectionResult, ProcessingResult, Bound
from analytic_types.data_bucket import DataBucket
from analytic_types.segment import Segment, AnomalyDetectorSegment
from analytic_types.cache import AnomalyCache
from detectors import Detector, ProcessingDetector
import utils

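# A past value is treated as still influencing the forecast while
# (1 - alpha) ** level stays above MIN_DEPENDENCY_FACTOR; MAX_DEPENDENCY_LEVEL
# caps that search and also sizes the streaming data bucket (BUCKET_SIZE).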
MAX_DEPENDENCY_LEVEL = 100
MIN_DEPENDENCY_FACTOR = 0.1
BASIC_ALPHA = 0.5
BUCKET_SIZE = MAX_DEPENDENCY_LEVEL

logger = logging.getLogger('ANOMALY_DETECTOR')

class AnomalyDetector(ProcessingDetector):

    def __init__(self, analytic_unit_id: AnalyticUnitId):
        super().__init__(analytic_unit_id)
        self.bucket = DataBucket()

    def train(self, dataframe: pd.DataFrame, payload: Union[list, dict], cache: Optional[ModelCache]) -> ModelCache:
        cache = AnomalyCache.from_json(payload)
        cache.time_step = utils.find_interval(dataframe)
        segments = cache.segments
        if len(segments) > 0:
            seasonality = cache.seasonality
            prepared_segments = []
            for segment in segments:
                segment_len = (int(segment.to_timestamp) - int(segment.from_timestamp))
                assert segment_len <= seasonality, \
                    f'seasonality {seasonality} must be greater than or equal to segment length {segment_len}'
                from_index = utils.timestamp_to_index(dataframe, pd.to_datetime(segment.from_timestamp, unit='ms'))
                to_index = utils.timestamp_to_index(dataframe, pd.to_datetime(segment.to_timestamp, unit='ms'))
                segment_data = dataframe[from_index : to_index]
                prepared_segments.append(
                    AnomalyDetectorSegment(
                        segment.from_timestamp,
                        segment.to_timestamp,
                        segment_data.value.tolist()
                    )
                )
            cache.set_segments(prepared_segments)
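        # The serialized cache is what detect() and process_data() consume later.
        # Judging by the attribute access in this file it holds at least alpha,
        # confidence, seasonality, timeStep and the prepared segments; the exact
        # JSON layout is defined by AnomalyCache and is not fully visible here.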
        return {
            'cache': cache.to_json()
        }

    # TODO: ModelCache -> DetectorState
    def detect(self, dataframe: pd.DataFrame, cache: Optional[ModelCache]) -> DetectionResult:
        if cache is None:
            raise ValueError(f'Analytic unit {self.analytic_unit_id} got empty cache')
        data = dataframe['value']
        cache = AnomalyCache.from_json(cache)
        segments = cache.segments
        enabled_bounds = cache.get_enabled_bounds()

        smoothed_data = utils.exponential_smoothing(data, cache.alpha)
        # Confidence bands around the exponentially smoothed data
        lower_bound = smoothed_data - cache.confidence
        upper_bound = smoothed_data + cache.confidence

        if len(segments) > 0:
            data_start_time = utils.convert_pd_timestamp_to_ms(dataframe['timestamp'][0])
            for segment in segments:
                seasonality_index = cache.seasonality // cache.time_step
                seasonality_offset = self.get_seasonality_offset(
                    segment.from_timestamp,
                    cache.seasonality,
                    data_start_time,
                    cache.time_step
                )
                segment_data = pd.Series(segment.data)
                lower_bound = self.add_season_to_data(lower_bound, segment_data, seasonality_offset, seasonality_index, Bound.LOWER)
                upper_bound = self.add_season_to_data(upper_bound, segment_data, seasonality_offset, seasonality_index, Bound.UPPER)

        detected_segments = list(self.detections_generator(dataframe, upper_bound, lower_bound, enabled_bounds))
        last_dataframe_time = dataframe.iloc[-1]['timestamp']
        last_detection_time = utils.convert_pd_timestamp_to_ms(last_dataframe_time)
        return DetectionResult(cache.to_json(), detected_segments, last_detection_time)

    def consume_data(self, data: pd.DataFrame, cache: Optional[ModelCache]) -> Optional[DetectionResult]:
        if cache is None:
            msg = f'consume_data got invalid cache {cache} for task {self.analytic_unit_id}'
            logger.debug(msg)
            raise ValueError(msg)

        data_without_nan = data.dropna()
        if len(data_without_nan) == 0:
            return None

        window_size = self.get_window_size(cache)
        self.bucket.set_max_size(BUCKET_SIZE)
        self.bucket.append_data(data_without_nan)
        if self.bucket.get_current_size() >= window_size:
            return self.detect(self.bucket.data, cache)
        return None

    def is_detection_intersected(self) -> bool:
        return False

    def get_window_size(self, cache: Optional[ModelCache]) -> int:
        '''
        get the number of values that will affect the next value
        '''
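        # Worked example: with alpha = 0.5 the first power of (1 - alpha) below
        # MIN_DEPENDENCY_FACTOR = 0.1 is 0.5 ** 4 = 0.0625, so the window is
        # 4 values unless the seasonal window (seasonality // time_step) is larger.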
        if cache is None:
            raise ValueError('anomaly detector got None cache')
        cache = AnomalyCache.from_json(cache)
        for level in range(1, MAX_DEPENDENCY_LEVEL):
            if (1 - cache.alpha) ** level < MIN_DEPENDENCY_FACTOR:
                break
        seasonality = 0
        if len(cache.segments) > 0:
            seasonality = cache.seasonality // cache.time_step
        return max(level, seasonality)

    def concat_detection_results(self, detections: List[DetectionResult]) -> DetectionResult:
        result = DetectionResult()
        time_step = detections[0].cache['timeStep']
        for detection in detections:
            result.segments.extend(detection.segments)
            result.last_detection_time = detection.last_detection_time
            result.cache = detection.cache
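        # Collapse segments that touch or overlap across consecutive detection
        # chunks so an anomaly spanning a chunk boundary is reported once;
        # time_step presumably serves as the adjacency tolerance here.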
        result.segments = utils.merge_intersecting_segments(result.segments, time_step)
        return result

    # TODO: remove duplication with detect()
    def process_data(self, dataframe: pd.DataFrame, cache: ModelCache) -> ProcessingResult:
        cache = AnomalyCache.from_json(cache)
        segments = cache.segments
        enabled_bounds = cache.get_enabled_bounds()

        # TODO: exponential_smoothing should return dataframe with related timestamps
        smoothed_data = utils.exponential_smoothing(dataframe['value'], cache.alpha)
        lower_bound = smoothed_data - cache.confidence
        upper_bound = smoothed_data + cache.confidence

        if len(segments) > 0:
            data_start_time = utils.convert_pd_timestamp_to_ms(dataframe['timestamp'][0])
            for segment in segments:
                seasonality_index = cache.seasonality // cache.time_step
                # TODO: move it to utils and add tests
                seasonality_offset = self.get_seasonality_offset(
                    segment.from_timestamp,
                    cache.seasonality,
                    data_start_time,
                    cache.time_step
                )
                segment_data = pd.Series(segment.data)
                lower_bound = self.add_season_to_data(lower_bound, segment_data, seasonality_offset, seasonality_index, Bound.LOWER)
                upper_bound = self.add_season_to_data(upper_bound, segment_data, seasonality_offset, seasonality_index, Bound.UPPER)
            # TODO: support multiple segments

        timestamps = utils.convert_series_to_timestamp_list(dataframe.timestamp)
        lower_bound_timeseries = list(zip(timestamps, lower_bound.values.tolist()))
        upper_bound_timeseries = list(zip(timestamps, upper_bound.values.tolist()))
        if enabled_bounds == Bound.ALL:
            return ProcessingResult(lower_bound_timeseries, upper_bound_timeseries)
        elif enabled_bounds == Bound.UPPER:
            return ProcessingResult(upper_bound = upper_bound_timeseries)
        elif enabled_bounds == Bound.LOWER:
            return ProcessingResult(lower_bound = lower_bound_timeseries)

    def add_season_to_data(self, data: pd.Series, segment: pd.Series, offset: int, seasonality: int, bound_type: Bound) -> pd.Series:
        '''
        data - smoothed data to which the seasonal segment bound will be added
        the segment bound is added to the upper band and subtracted from the lower band
        '''
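        # Illustration: with offset = 2 and seasonality = 5 the segment bound is
        # added at indices 2, 7, 12, ... of the smoothed series; each addition is
        # indexed by segment.index + idx, so it patches a segment-length window,
        # and the final slice trims anything added past the original length.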
        len_smoothed_data = len(data)
        for idx, _ in enumerate(data):
            if idx - offset < 0:
                # TODO: add seasonality for non empty parts
                continue
            if (idx - offset) % seasonality == 0:
                if bound_type == Bound.UPPER:
                    upper_segment_bound = self.get_segment_bound(segment, Bound.UPPER)
                    data = data.add(pd.Series(upper_segment_bound.values, index = segment.index + idx), fill_value = 0)
                elif bound_type == Bound.LOWER:
                    lower_segment_bound = self.get_segment_bound(segment, Bound.LOWER)
                    data = data.add(pd.Series(lower_segment_bound.values * -1, index = segment.index + idx), fill_value = 0)
                else:
                    raise ValueError(f'unknown bound type: {bound_type.value}')
        return data[:len_smoothed_data]

    def get_segment_bound(self, segment: pd.Series, bound: Bound) -> pd.Series:
        '''
        the segment is split at its median to select its top or bottom part,
        which is then smoothed and shifted above or below the segment
        '''
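        # Sketch of the idea: for Bound.UPPER the values above the median are
        # kept (the rest are clamped to the median), smoothed, and shifted by the
        # largest smoothing error so the result envelopes the selected part;
        # Bound.LOWER does the same with the values at or below the median.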
        if len(segment) < 2:
            return segment
        comparison_operator = operator.gt if bound == Bound.UPPER else operator.le
        segment = segment - segment.min()
        segment_median = segment.median()
        part = [val if comparison_operator(val, segment_median) else segment_median for val in segment.values]
        part = pd.Series(part, index = segment.index)
        smoothed_part = utils.exponential_smoothing(part, BASIC_ALPHA)
        difference = [abs(x - y) for x, y in zip(part, smoothed_part)]
        max_diff = max(difference)
        bound_series = [val + max_diff for val in smoothed_part.values]
        bound_series = pd.Series(bound_series, index = segment.index)
        return bound_series

    def get_seasonality_offset(self, from_timestamp: int, seasonality: int, data_start_time: int, time_step: int) -> int:
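        # Worked example: from_timestamp = 1000, data_start_time = 5000,
        # seasonality = 3000, time_step = 1000:
        # season_count = ceil(4000 / 3000) = 2, start_seasonal_segment = 7000,
        # seasonality_time_offset = abs(7000 - 5000) % 3000 = 2000,
        # so the offset is ceil(2000 / 1000) = 2 data points.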
        season_count = math.ceil(abs(from_timestamp - data_start_time) / seasonality)
        start_seasonal_segment = from_timestamp + seasonality * season_count
        seasonality_time_offset = abs(start_seasonal_segment - data_start_time) % seasonality
        seasonality_offset = math.ceil(seasonality_time_offset / time_step)
        return seasonality_offset

    def detections_generator(
        self,
        dataframe: pd.DataFrame,
        upper_bound: pd.Series,
        lower_bound: pd.Series,
        enabled_bounds: Bound
    ) -> Generator[Segment, None, Segment]:
        in_segment = False
        segment_start = 0
        bound: Optional[Bound] = None
        for idx, val in enumerate(dataframe['value'].values):
            if val > upper_bound.values[idx]:
                if enabled_bounds == Bound.UPPER or enabled_bounds == Bound.ALL:
                    if not in_segment:
                        in_segment = True
                        segment_start = dataframe['timestamp'][idx]
                        bound = Bound.UPPER
                    continue

            if val < lower_bound.values[idx]:
                if enabled_bounds == Bound.LOWER or enabled_bounds == Bound.ALL:
                    if not in_segment:
                        in_segment = True
                        segment_start = dataframe['timestamp'][idx]
                        bound = Bound.LOWER
                    continue

            if in_segment:
                segment_end = dataframe['timestamp'][idx - 1]
                yield Segment(
                    utils.convert_pd_timestamp_to_ms(segment_start),
                    utils.convert_pd_timestamp_to_ms(segment_end),
                    # TODO: configurable decimals number
                    message=f'{val:.2f} out of {bound.value} bound'
                )
                in_segment = False
        else:
            # for-else: the loop finished without break; close a trailing segment
            if in_segment:
                segment_end = dataframe['timestamp'][idx]
                return Segment(
                    utils.convert_pd_timestamp_to_ms(segment_start),
                    utils.convert_pd_timestamp_to_ms(segment_end),
                    message=f'{val:.2f} out of {bound.value} bound'
                )
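

# Minimal usage sketch (not part of the original module). The payload keys below
# are assumptions inferred from the attribute names used above; the real schema
# is defined by AnomalyCache.from_json, which is not shown in this file.
#
#     detector = AnomalyDetector('test_unit')
#     dataframe = pd.DataFrame({
#         'timestamp': pd.date_range('2020-01-01', periods=100, freq='1min'),
#         'value': np.random.randn(100),
#     })
#     payload = {'alpha': 0.5, 'confidence': 2.0, 'enableBounds': 'ALL', 'segments': []}  # hypothetical keys
#     cache = detector.train(dataframe, payload, None)['cache']
#     result = detector.detect(dataframe, cache)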