|
|
|
@ -1,14 +1,12 @@
|
|
|
|
|
from enum import Enum |
|
|
|
|
import logging |
|
|
|
|
import numpy as np |
|
|
|
|
import operator |
|
|
|
|
from collections import OrderedDict |
|
|
|
|
import pandas as pd |
|
|
|
|
import math |
|
|
|
|
from typing import Optional, Union, List, Tuple |
|
|
|
|
|
|
|
|
|
from analytic_types import AnalyticUnitId, ModelCache |
|
|
|
|
from analytic_types.detector_typing import DetectionResult, AnomalyProcessingResult |
|
|
|
|
from analytic_types.detector_typing import DetectionResult, ProcessingResult |
|
|
|
|
from analytic_types.data_bucket import DataBucket |
|
|
|
|
from analytic_types.segment import Segment |
|
|
|
|
from detectors import Detector, ProcessingDetector |
|
|
|
@ -27,12 +25,12 @@ class Bound(Enum):
|
|
|
|
|
class AnomalyDetector(ProcessingDetector): |
|
|
|
|
|
|
|
|
|
def __init__(self, analytic_unit_id: AnalyticUnitId): |
|
|
|
|
self.analytic_unit_id = analytic_unit_id |
|
|
|
|
super().__init__(analytic_unit_id) |
|
|
|
|
self.bucket = DataBucket() |
|
|
|
|
|
|
|
|
|
def train(self, dataframe: pd.DataFrame, payload: Union[list, dict], cache: Optional[ModelCache]) -> ModelCache: |
|
|
|
|
segments = payload.get('segments') |
|
|
|
|
enable_bounds: str = payload.get('enableBounds') or 'ALL' |
|
|
|
|
enable_bounds = Bound(payload.get('enableBounds') or 'ALL') |
|
|
|
|
prepared_segments = [] |
|
|
|
|
time_step = utils.find_interval(dataframe) |
|
|
|
|
|
|
|
|
@ -40,7 +38,7 @@ class AnomalyDetector(ProcessingDetector):
|
|
|
|
|
'confidence': payload['confidence'], |
|
|
|
|
'alpha': payload['alpha'], |
|
|
|
|
'timeStep': time_step, |
|
|
|
|
'enableBounds': enable_bounds |
|
|
|
|
'enableBounds': enable_bounds.value |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if segments is not None: |
|
|
|
@ -65,55 +63,53 @@ class AnomalyDetector(ProcessingDetector):
|
|
|
|
|
'cache': new_cache |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
# TODO: ModelCache -> ModelState |
|
|
|
|
# TODO: ModelCache -> DetectorState |
|
|
|
|
def detect(self, dataframe: pd.DataFrame, cache: Optional[ModelCache]) -> DetectionResult: |
|
|
|
|
if cache == None: |
|
|
|
|
raise f'Analytic unit {self.analytic_unit_id} got empty cache' |
|
|
|
|
data = dataframe['value'] |
|
|
|
|
time_step = cache['timeStep'] |
|
|
|
|
segments = cache.get('segments') |
|
|
|
|
enable_bounds: str = cache.get('enableBounds') or 'ALL' |
|
|
|
|
|
|
|
|
|
smoothed_data = utils.exponential_smoothing(data, cache['alpha']) |
|
|
|
|
|
|
|
|
|
# TODO: use class for cache to avoid using string literals and Bound.TYPE.value |
|
|
|
|
bounds = OrderedDict() |
|
|
|
|
bounds[Bound.LOWER.value] = ( smoothed_data - cache['confidence'], operator.lt ) |
|
|
|
|
bounds[Bound.UPPER.value] = ( smoothed_data + cache['confidence'], operator.gt ) |
|
|
|
|
# TODO: use class for cache to avoid using string literals |
|
|
|
|
alpha = self.get_value_from_cache(cache, 'alpha', required = True) |
|
|
|
|
confidence = self.get_value_from_cache(cache, 'confidence', required = True) |
|
|
|
|
segments = self.get_value_from_cache(cache, 'segments') |
|
|
|
|
enable_bounds = Bound(self.get_value_from_cache(cache, 'enableBounds') or 'ALL') |
|
|
|
|
|
|
|
|
|
if enable_bounds == Bound.LOWER.value: |
|
|
|
|
del bounds[Bound.UPPER.value] |
|
|
|
|
|
|
|
|
|
if enable_bounds == Bound.UPPER.value: |
|
|
|
|
del bounds[Bound.LOWER.value] |
|
|
|
|
smoothed_data = utils.exponential_smoothing(data, alpha) |
|
|
|
|
|
|
|
|
|
lower_bound = smoothed_data - confidence |
|
|
|
|
upper_bound = smoothed_data + confidence |
|
|
|
|
|
|
|
|
|
if segments is not None: |
|
|
|
|
|
|
|
|
|
seasonality = cache.get('seasonality') |
|
|
|
|
assert seasonality is not None and seasonality > 0, \ |
|
|
|
|
time_step = self.get_value_from_cache(cache, 'timeStep', required = True) |
|
|
|
|
seasonality = self.get_value_from_cache(cache, 'seasonality', required = True) |
|
|
|
|
assert seasonality > 0, \ |
|
|
|
|
f'{self.analytic_unit_id} got invalid seasonality {seasonality}' |
|
|
|
|
|
|
|
|
|
data_start_time = utils.convert_pd_timestamp_to_ms(dataframe['timestamp'][0]) |
|
|
|
|
data_second_time = utils.convert_pd_timestamp_to_ms(dataframe['timestamp'][1]) |
|
|
|
|
|
|
|
|
|
for segment in segments: |
|
|
|
|
seasonality_index = seasonality // time_step |
|
|
|
|
season_count = math.ceil(abs(segment['from'] - data_start_time) / seasonality) |
|
|
|
|
start_seasonal_segment = segment['from'] + seasonality * season_count |
|
|
|
|
seasonality_offset = (abs(start_seasonal_segment - data_start_time) % seasonality) // time_step |
|
|
|
|
#TODO: upper and lower bounds for segment_data |
|
|
|
|
|
|
|
|
|
segment_data = pd.Series(segment['data']) |
|
|
|
|
for bound_type, bound_data in bounds.items(): |
|
|
|
|
bound_data, _ = bound_data |
|
|
|
|
bounds[bound_type] = self.add_season_to_data(bound_data, segment_data, seasonality_offset, seasonality_index, bound_type) |
|
|
|
|
assert len(smoothed_data) == len(bounds[bound_type]), \ |
|
|
|
|
f'len smoothed {len(smoothed_data)} != len seasonality {len(bounds[bound_type])}' |
|
|
|
|
|
|
|
|
|
lower_bound = self.add_season_to_data(lower_bound, segment_data, seasonality_offset, seasonality_index, Bound.LOWER) |
|
|
|
|
upper_bound = self.add_season_to_data(upper_bound, segment_data, seasonality_offset, seasonality_index, Bound.UPPER) |
|
|
|
|
|
|
|
|
|
anomaly_indexes = [] |
|
|
|
|
for idx, val in enumerate(data.values): |
|
|
|
|
for bound_type, bound_data in bounds.items(): |
|
|
|
|
bound_data, comparator = bound_data |
|
|
|
|
if comparator(val, bound_data.values[idx]): |
|
|
|
|
if val > upper_bound.values[idx]: |
|
|
|
|
if enable_bounds == Bound.UPPER or enable_bounds == Bound.ALL: |
|
|
|
|
anomaly_indexes.append(data.index[idx]) |
|
|
|
|
|
|
|
|
|
if val < lower_bound.values[idx]: |
|
|
|
|
if enable_bounds == Bound.LOWER or enable_bounds == Bound.ALL: |
|
|
|
|
anomaly_indexes.append(data.index[idx]) |
|
|
|
|
|
|
|
|
|
# TODO: use Segment in utils |
|
|
|
|
segments = utils.close_filtering(anomaly_indexes, 1) |
|
|
|
|
segments = utils.get_start_and_end_of_segments(segments) |
|
|
|
@ -176,34 +172,27 @@ class AnomalyDetector(ProcessingDetector):
|
|
|
|
|
result.segments = utils.merge_intersecting_segments(result.segments, time_step) |
|
|
|
|
return result |
|
|
|
|
|
|
|
|
|
# TODO: ModelCache -> ModelState (don't use string literals) |
|
|
|
|
def process_data(self, dataframe: pd.DataFrame, cache: ModelCache) -> AnomalyProcessingResult: |
|
|
|
|
segments = cache.get('segments') |
|
|
|
|
enable_bounds: str = cache.get('enableBounds') or 'ALL' |
|
|
|
|
# TODO: remove duplication with detect() |
|
|
|
|
def process_data(self, dataframe: pd.DataFrame, cache: ModelCache) -> ProcessingResult: |
|
|
|
|
segments = self.get_value_from_cache(cache, 'segments') |
|
|
|
|
alpha = self.get_value_from_cache(cache, 'alpha', required = True) |
|
|
|
|
confidence = self.get_value_from_cache(cache, 'confidence', required = True) |
|
|
|
|
enable_bounds = Bound(self.get_value_from_cache(cache, 'enableBounds') or 'ALL') |
|
|
|
|
|
|
|
|
|
# TODO: exponential_smoothing should return dataframe with related timestamps |
|
|
|
|
smoothed_data = utils.exponential_smoothing(dataframe['value'], cache['alpha']) |
|
|
|
|
|
|
|
|
|
bounds = OrderedDict() |
|
|
|
|
bounds[Bound.LOWER.value] = smoothed_data - cache['confidence'] |
|
|
|
|
bounds[Bound.UPPER.value] = smoothed_data + cache['confidence'] |
|
|
|
|
|
|
|
|
|
if enable_bounds == Bound.LOWER.value: |
|
|
|
|
del bounds[Bound.UPPER.value] |
|
|
|
|
|
|
|
|
|
if enable_bounds == Bound.UPPER.value: |
|
|
|
|
del bounds[Bound.LOWER.value] |
|
|
|
|
|
|
|
|
|
smoothed_data = utils.exponential_smoothing(dataframe['value'], alpha) |
|
|
|
|
|
|
|
|
|
# TODO: remove duplication with detect() |
|
|
|
|
lower_bound = smoothed_data - confidence |
|
|
|
|
upper_bound = smoothed_data + confidence |
|
|
|
|
|
|
|
|
|
if segments is not None: |
|
|
|
|
seasonality = cache.get('seasonality') |
|
|
|
|
assert seasonality is not None and seasonality > 0, \ |
|
|
|
|
seasonality = self.get_value_from_cache(cache, 'seasonality', required = True) |
|
|
|
|
assert seasonality > 0, \ |
|
|
|
|
f'{self.analytic_unit_id} got invalid seasonality {seasonality}' |
|
|
|
|
|
|
|
|
|
data_start_time = utils.convert_pd_timestamp_to_ms(dataframe['timestamp'][0]) |
|
|
|
|
time_step = cache['timeStep'] |
|
|
|
|
|
|
|
|
|
time_step = self.get_value_from_cache(cache, 'timeStep', required = True) |
|
|
|
|
|
|
|
|
|
for segment in segments: |
|
|
|
|
seasonality_index = seasonality // time_step |
|
|
|
@ -212,19 +201,22 @@ class AnomalyDetector(ProcessingDetector):
|
|
|
|
|
start_seasonal_segment = segment['from'] + seasonality * season_count |
|
|
|
|
seasonality_offset = (abs(start_seasonal_segment - data_start_time) % seasonality) // time_step |
|
|
|
|
segment_data = pd.Series(segment['data']) |
|
|
|
|
for bound_type, bound_data in bounds.items(): |
|
|
|
|
bounds[bound_type] = self.add_season_to_data(bound_data, segment_data, seasonality_offset, seasonality_index, bound_type) |
|
|
|
|
assert len(smoothed_data) == len(bounds[bound_type]), \ |
|
|
|
|
f'len smoothed {len(smoothed_data)} != len seasonality {len(bounds[bound_type])}' |
|
|
|
|
|
|
|
|
|
lower_bound = self.add_season_to_data(lower_bound, segment_data, seasonality_offset, seasonality_index, Bound.LOWER) |
|
|
|
|
upper_bound = self.add_season_to_data(upper_bound, segment_data, seasonality_offset, seasonality_index, Bound.UPPER) |
|
|
|
|
|
|
|
|
|
# TODO: support multiple segments |
|
|
|
|
|
|
|
|
|
timestamps = utils.convert_series_to_timestamp_list(dataframe.timestamp) |
|
|
|
|
result_bounds = {} |
|
|
|
|
for bound_type, bound_data in bounds.items(): |
|
|
|
|
result_bounds[bound_type] = list(zip(timestamps, bound_data.values.tolist())) |
|
|
|
|
result = AnomalyProcessingResult(lower_bound=result_bounds.get(Bound.LOWER.value), upper_bound=result_bounds.get(Bound.UPPER.value)) |
|
|
|
|
return result |
|
|
|
|
lower_bound_timeseries = list(zip(timestamps, lower_bound.values.tolist())) |
|
|
|
|
upper_bound_timeseries = list(zip(timestamps, upper_bound.values.tolist())) |
|
|
|
|
|
|
|
|
|
if enable_bounds == Bound.ALL: |
|
|
|
|
return ProcessingResult(lower_bound_timeseries, upper_bound_timeseries) |
|
|
|
|
elif enable_bounds == Bound.UPPER: |
|
|
|
|
return ProcessingResult(upper_bound = upper_bound_timeseries) |
|
|
|
|
elif enable_bounds == Bound.LOWER: |
|
|
|
|
return ProcessingResult(lower_bound = lower_bound_timeseries) |
|
|
|
|
|
|
|
|
|
def add_season_to_data(self, data: pd.Series, segment: pd.Series, offset: int, seasonality: int, bound_type: Bound) -> pd.Series: |
|
|
|
|
#data - smoothed data to which seasonality will be added |
|
|
|
@ -236,33 +228,17 @@ class AnomalyDetector(ProcessingDetector):
|
|
|
|
|
#TODO: add seasonality for non empty parts |
|
|
|
|
continue |
|
|
|
|
if (idx - offset) % seasonality == 0: |
|
|
|
|
if bound_type == Bound.UPPER.value: |
|
|
|
|
if bound_type == Bound.UPPER: |
|
|
|
|
upper_segment_bound = self.get_bounds_for_segment(segment)[0] |
|
|
|
|
data = data.add(pd.Series(upper_segment_bound.values, index = segment.index + idx), fill_value = 0) |
|
|
|
|
elif bound_type == Bound.LOWER.value: |
|
|
|
|
elif bound_type == Bound.LOWER: |
|
|
|
|
lower_segment_bound = self.get_bounds_for_segment(segment)[1] |
|
|
|
|
data = data.add(pd.Series(lower_segment_bound.values * -1, index = segment.index + idx), fill_value = 0) |
|
|
|
|
else: |
|
|
|
|
raise ValueError(f'unknown {bound_type}') |
|
|
|
|
raise ValueError(f'unknown bound type: {bound_type.value}') |
|
|
|
|
|
|
|
|
|
return data[:len_smoothed_data] |
|
|
|
|
|
|
|
|
|
def concat_processing_results(self, processing_results: List[AnomalyProcessingResult]) -> Optional[AnomalyProcessingResult]: |
|
|
|
|
if len(processing_results) == 0: |
|
|
|
|
return None |
|
|
|
|
|
|
|
|
|
united_result = AnomalyProcessingResult() |
|
|
|
|
for result in processing_results: |
|
|
|
|
if result.lower_bound is not None: |
|
|
|
|
if united_result.lower_bound is None: united_result.lower_bound = [] |
|
|
|
|
united_result.lower_bound.extend(result.lower_bound) |
|
|
|
|
|
|
|
|
|
if result.upper_bound is not None: |
|
|
|
|
if united_result.upper_bound is None: united_result.upper_bound = [] |
|
|
|
|
united_result.upper_bound.extend(result.upper_bound) |
|
|
|
|
|
|
|
|
|
return united_result |
|
|
|
|
|
|
|
|
|
def get_bounds_for_segment(self, segment: pd.Series) -> Tuple[pd.Series, pd.Series]: |
|
|
|
|
''' |
|
|
|
|
segment is divided by the median to determine its top and bottom parts |
|
|
|
|