from enum import Enum
import logging
import numpy as np
import pandas as pd
import math
from typing import Optional, Union, List, Tuple, Generator
import operator

from analytic_types import AnalyticUnitId, ModelCache
from analytic_types.detector import DetectionResult, ProcessingResult, Bound
from analytic_types.data_bucket import DataBucket
from analytic_types.segment import Segment, AnomalyDetectorSegment
from analytic_types.cache import AnomalyCache
from detectors import Detector, ProcessingDetector
import utils

MAX_DEPENDENCY_LEVEL = 100
MIN_DEPENDENCY_FACTOR = 0.1
BASIC_ALPHA = 0.5
BUCKET_SIZE = MAX_DEPENDENCY_LEVEL

logger = logging.getLogger('ANOMALY_DETECTOR')


class AnomalyDetector(ProcessingDetector):

    def __init__(self, analytic_unit_id: AnalyticUnitId):
        super().__init__(analytic_unit_id)
        self.bucket = DataBucket()

    def train(self, dataframe: pd.DataFrame, payload: Union[list, dict], cache: Optional[ModelCache]) -> ModelCache:
        cache = AnomalyCache.from_json(payload)
        cache.time_step = utils.find_interval(dataframe)
        segments = cache.segments
        if len(segments) > 0:
            seasonality = cache.seasonality
            prepared_segments = []

            for segment in segments:
                segment_len = (int(segment.to_timestamp) - int(segment.from_timestamp))
                assert segment_len <= seasonality, \
                    f'seasonality {seasonality} must be greater than or equal to segment length {segment_len}'

                from_index = utils.timestamp_to_index(dataframe, pd.to_datetime(segment.from_timestamp, unit='ms'))
                to_index = utils.timestamp_to_index(dataframe, pd.to_datetime(segment.to_timestamp, unit='ms'))
                segment_data = dataframe[from_index : to_index]
                prepared_segments.append(
                    AnomalyDetectorSegment(
                        segment.from_timestamp,
                        segment.to_timestamp,
                        segment_data.value.tolist()
                    )
                )
            cache.set_segments(prepared_segments)

        return {
            'cache': cache.to_json()
        }

    # TODO: ModelCache -> DetectorState
    def detect(self, dataframe: pd.DataFrame, cache: Optional[ModelCache]) -> DetectionResult:
        if cache is None:
            raise ValueError(f'Analytic unit {self.analytic_unit_id} got empty cache')
        data = dataframe['value']

        cache = AnomalyCache.from_json(cache)
        segments = cache.segments
        enabled_bounds = cache.get_enabled_bounds()

        smoothed_data = utils.exponential_smoothing(data, cache.alpha)

        lower_bound = smoothed_data - cache.confidence
        upper_bound = smoothed_data + cache.confidence

        if len(segments) > 0:
            data_start_time = utils.convert_pd_timestamp_to_ms(dataframe['timestamp'][0])

            for segment in segments:
                seasonality_index = cache.seasonality // cache.time_step
                seasonality_offset = self.get_seasonality_offset(
                    segment.from_timestamp,
                    cache.seasonality,
                    data_start_time,
                    cache.time_step
                )
                segment_data = pd.Series(segment.data)

                lower_bound = self.add_season_to_data(lower_bound, segment_data, seasonality_offset, seasonality_index, Bound.LOWER)
                upper_bound = self.add_season_to_data(upper_bound, segment_data, seasonality_offset, seasonality_index, Bound.UPPER)

        detected_segments = list(self.detections_generator(dataframe, upper_bound, lower_bound, enabled_bounds))

        last_dataframe_time = dataframe.iloc[-1]['timestamp']
        last_detection_time = utils.convert_pd_timestamp_to_ms(last_dataframe_time)

        return DetectionResult(cache.to_json(), detected_segments, last_detection_time)

    def consume_data(self, data: pd.DataFrame, cache: Optional[ModelCache]) -> Optional[DetectionResult]:
        if cache is None:
            msg = f'consume_data got invalid cache {cache} for task {self.analytic_unit_id}'
            logger.debug(msg)
            raise ValueError(msg)

        data_without_nan = data.dropna()

        if len(data_without_nan) == 0:
            return None
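        # Descriptive note (inferred from this method and get_window_size below):
        # incoming points are buffered until at least get_window_size(cache) samples
        # are available, i.e. until enough history exists for the exponentially
        # smoothed bounds to be meaningful; only then is detect() invoked.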
        window_size = self.get_window_size(cache)
        self.bucket.set_max_size(BUCKET_SIZE)
        self.bucket.append_data(data_without_nan)

        if self.bucket.get_current_size() >= window_size:
            return self.detect(self.bucket.data, cache)

        return None

    def is_detection_intersected(self) -> bool:
        return False

    def get_window_size(self, cache: Optional[ModelCache]) -> int:
        '''
        get the number of values that will affect the next value
        '''

        if cache is None:
            raise ValueError('anomaly detector got None cache')
        cache = AnomalyCache.from_json(cache)

        # stop at the first level where the smoothing weight (1 - alpha) ** level
        # drops below MIN_DEPENDENCY_FACTOR
        for level in range(1, MAX_DEPENDENCY_LEVEL):
            if (1 - cache.alpha) ** level < MIN_DEPENDENCY_FACTOR:
                break

        seasonality = 0
        if len(cache.segments) > 0:
            seasonality = cache.seasonality // cache.time_step
        return max(level, seasonality)

    def concat_detection_results(self, detections: List[DetectionResult]) -> DetectionResult:
        result = DetectionResult()
        time_step = detections[0].cache['timeStep']
        for detection in detections:
            result.segments.extend(detection.segments)
            result.last_detection_time = detection.last_detection_time
            result.cache = detection.cache
        result.segments = utils.merge_intersecting_segments(result.segments, time_step)
        return result

    # TODO: remove duplication with detect()
    def process_data(self, dataframe: pd.DataFrame, cache: ModelCache) -> ProcessingResult:
        cache = AnomalyCache.from_json(cache)
        segments = cache.segments
        enabled_bounds = cache.get_enabled_bounds()

        # TODO: exponential_smoothing should return dataframe with related timestamps
        smoothed_data = utils.exponential_smoothing(dataframe['value'], cache.alpha)

        lower_bound = smoothed_data - cache.confidence
        upper_bound = smoothed_data + cache.confidence

        if len(segments) > 0:
            data_start_time = utils.convert_pd_timestamp_to_ms(dataframe['timestamp'][0])

            for segment in segments:
                seasonality_index = cache.seasonality // cache.time_step
                # TODO: move it to utils and add tests
                seasonality_offset = self.get_seasonality_offset(
                    segment.from_timestamp,
                    cache.seasonality,
                    data_start_time,
                    cache.time_step
                )
                segment_data = pd.Series(segment.data)

                lower_bound = self.add_season_to_data(lower_bound, segment_data, seasonality_offset, seasonality_index, Bound.LOWER)
                upper_bound = self.add_season_to_data(upper_bound, segment_data, seasonality_offset, seasonality_index, Bound.UPPER)

        # TODO: support multiple segments

        timestamps = utils.convert_series_to_timestamp_list(dataframe.timestamp)
        lower_bound_timeseries = list(zip(timestamps, lower_bound.values.tolist()))
        upper_bound_timeseries = list(zip(timestamps, upper_bound.values.tolist()))

        if enabled_bounds == Bound.ALL:
            return ProcessingResult(lower_bound_timeseries, upper_bound_timeseries)
        elif enabled_bounds == Bound.UPPER:
            return ProcessingResult(upper_bound = upper_bound_timeseries)
        elif enabled_bounds == Bound.LOWER:
            return ProcessingResult(lower_bound = lower_bound_timeseries)

    def add_season_to_data(self, data: pd.Series, segment: pd.Series, offset: int, seasonality: int, bound_type: Bound) -> pd.Series:
        # data - smoothed data to which seasonality will be added
        # the segment envelope is added to the upper bound and subtracted from the lower bound
        len_smoothed_data = len(data)
        for idx, _ in enumerate(data):
            if idx - offset < 0:
                # TODO: add seasonality for non empty parts
                continue
            if (idx - offset) % seasonality == 0:
                if bound_type == Bound.UPPER:
                    upper_segment_bound = self.get_segment_bound(segment, Bound.UPPER)
                    data = data.add(pd.Series(upper_segment_bound.values, index = segment.index + idx), fill_value = 0)
                elif bound_type == Bound.LOWER:
                    lower_segment_bound = self.get_segment_bound(segment, Bound.LOWER)
                    data = data.add(pd.Series(lower_segment_bound.values * -1, index = segment.index + idx), fill_value = 0)
                else:
                    raise ValueError(f'unknown bound type: {bound_type.value}')

        return data[:len_smoothed_data]
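    # Descriptive note (inferred from add_season_to_data above): one copy of the
    # seasonal segment envelope is anchored at every index idx where
    # (idx - offset) % seasonality == 0, i.e. at offset, offset + seasonality,
    # offset + 2 * seasonality, ... For example, with offset = 2 and seasonality = 5
    # copies start at indices 2, 7, 12 and so on. The upper envelope widens the
    # upper bound and the lower envelope widens the lower bound downwards.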
    def get_segment_bound(self, segment: pd.Series, bound: Bound) -> pd.Series:
        '''
        segment is divided by the median to determine its top or bottom part
        the part is smoothed and raised above the segment or put down below the segment
        '''
        if len(segment) < 2:
            return segment
        comparison_operator = operator.gt if bound == Bound.UPPER else operator.le
        segment = segment - segment.min()
        segment_median = segment.median()
        part = [val if comparison_operator(val, segment_median) else segment_median for val in segment.values]
        part = pd.Series(part, index = segment.index)
        smoothed_part = utils.exponential_smoothing(part, BASIC_ALPHA)
        difference = [abs(x - y) for x, y in zip(part, smoothed_part)]
        max_diff = max(difference)
        bound = [val + max_diff for val in smoothed_part.values]
        bound = pd.Series(bound, index = segment.index)
        return bound

    def get_seasonality_offset(self, from_timestamp: int, seasonality: int, data_start_time: int, time_step: int) -> int:
        season_count = math.ceil(abs(from_timestamp - data_start_time) / seasonality)
        start_seasonal_segment = from_timestamp + seasonality * season_count
        seasonality_time_offset = abs(start_seasonal_segment - data_start_time) % seasonality
        seasonality_offset = math.ceil(seasonality_time_offset / time_step)
        return seasonality_offset

    def detections_generator(
        self,
        dataframe: pd.DataFrame,
        upper_bound: pd.DataFrame,
        lower_bound: pd.DataFrame,
        enabled_bounds: Bound
    ) -> Generator[Segment, None, Segment]:
        in_segment = False
        segment_start = 0
        bound: Optional[Bound] = None
        for idx, val in enumerate(dataframe['value'].values):
            if val > upper_bound.values[idx]:
                if enabled_bounds == Bound.UPPER or enabled_bounds == Bound.ALL:
                    if not in_segment:
                        in_segment = True
                        segment_start = dataframe['timestamp'][idx]
                        bound = Bound.UPPER
                    continue

            if val < lower_bound.values[idx]:
                if enabled_bounds == Bound.LOWER or enabled_bounds == Bound.ALL:
                    if not in_segment:
                        in_segment = True
                        segment_start = dataframe['timestamp'][idx]
                        bound = Bound.LOWER
                    continue

            if in_segment:
                segment_end = dataframe['timestamp'][idx - 1]
                yield Segment(
                    utils.convert_pd_timestamp_to_ms(segment_start),
                    utils.convert_pd_timestamp_to_ms(segment_end),
                    # TODO: configurable decimals number
                    message=f'{val:.2f} out of {str(bound.value)} bound'
                )
                in_segment = False
        else:
            if in_segment:
                segment_end = dataframe['timestamp'][idx]
                return Segment(
                    utils.convert_pd_timestamp_to_ms(segment_start),
                    utils.convert_pd_timestamp_to_ms(segment_end),
                    message=f'{val:.2f} out of {str(bound.value)} bound'
                )
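# Typical lifecycle (illustrative sketch inferred from this module, not a documented
# API contract): the analytics server calls train() with a payload describing alpha,
# confidence and optional labeled seasonal segments, stores the returned
# {'cache': ...} object, then either streams points through consume_data() (which
# buffers until a full window is collected) or calls detect() / process_data() on a
# dataframe together with that cache. The exact payload schema is owned by
# AnomalyCache.from_json and is not restated here.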