@@ -6,9 +6,10 @@ import math
 from typing import Optional, Union, List, Tuple, Generator

 from analytic_types import AnalyticUnitId, ModelCache
-from analytic_types.detector_typing import DetectionResult, ProcessingResult
+from analytic_types.detector_typing import DetectionResult, ProcessingResult, Bound
 from analytic_types.data_bucket import DataBucket
 from analytic_types.segment import Segment, AnomalyDetectorSegment
+from analytic_types.cache import AnomalyCache

 from detectors import Detector, ProcessingDetector
 import utils
@@ -17,10 +18,6 @@ MIN_DEPENDENCY_FACTOR = 0.1
 BASIC_ALPHA = 0.5

 logger = logging.getLogger('ANOMALY_DETECTOR')

-class Bound(Enum):
-    ALL = 'ALL'
-    UPPER = 'UPPER'
-    LOWER = 'LOWER'

 class AnomalyDetector(ProcessingDetector):
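Note: the Bound enum removed here is not deleted outright; the first hunk now imports it from analytic_types.detector_typing. A sketch of the relocated definition, assuming it moved unchanged (its body matches the lines removed above):

from enum import Enum

class Bound(Enum):
    ALL = 'ALL'
    UPPER = 'UPPER'
    LOWER = 'LOWER'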
@@ -29,25 +26,15 @@ class AnomalyDetector(ProcessingDetector):
         self.bucket = DataBucket()

     def train(self, dataframe: pd.DataFrame, payload: Union[list, dict], cache: Optional[ModelCache]) -> ModelCache:
-        segments = payload.get('segments')
-        enable_bounds = Bound(payload.get('enableBounds') or 'ALL')
-        prepared_segments = []
-        time_step = utils.find_interval(dataframe)
-
-        new_cache = {
-            'confidence': payload['confidence'],
-            'alpha': payload['alpha'],
-            'timeStep': time_step,
-            'enableBounds': enable_bounds.value
-        }
+        cache = AnomalyCache.from_json(payload)
+        cache.time_step = utils.find_interval(dataframe)
+        segments = cache.segments

-        if segments is not None:
-            seasonality = payload.get('seasonality')
-            assert seasonality is not None and seasonality > 0, \
-                f'{self.analytic_unit_id} got invalid seasonality {seasonality}'
+        if len(segments) > 0:
+            seasonality = cache.seasonality
+            prepared_segments = []

-            parsed_segments = map(Segment.from_json, segments)
-            for segment in parsed_segments:
+            for segment in segments:
                 segment_len = (int(segment.to_timestamp) - int(segment.from_timestamp))
                 assert segment_len <= seasonality, \
                     f'seasonality {seasonality} must be greater than segment length {segment_len}'
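train() now parses the payload through AnomalyCache instead of building a raw dict by hand. The class itself is outside this diff; below is a minimal sketch inferred only from the calls made in these hunks (from_json/to_json, set_segments, get_enabled_bounds, and the alpha/confidence/seasonality/time_step attributes). The segment from_json/to_json helpers are assumptions; the real analytic_types/cache.py may differ.

from typing import Any, Dict, List, Optional

from analytic_types.detector_typing import Bound
from analytic_types.segment import AnomalyDetectorSegment

class AnomalyCache:
    def __init__(self, alpha: float, confidence: float, seasonality: Optional[int] = None):
        self.alpha = alpha
        self.confidence = confidence
        self.seasonality = seasonality
        self.time_step: Optional[int] = None
        self.enable_bounds: Bound = Bound.ALL
        self.segments: List[AnomalyDetectorSegment] = []

    @classmethod
    def from_json(cls, json: Dict[str, Any]) -> 'AnomalyCache':
        cache = cls(json['alpha'], json['confidence'], json.get('seasonality'))
        cache.time_step = json.get('timeStep')
        # mirrors the removed fallback Bound(payload.get('enableBounds') or 'ALL')
        cache.enable_bounds = Bound(json.get('enableBounds') or 'ALL')
        cache.set_segments(list(map(AnomalyDetectorSegment.from_json, json.get('segments', []))))
        return cache

    def set_segments(self, segments: List[AnomalyDetectorSegment]) -> None:
        self.segments = segments

    def get_enabled_bounds(self) -> Bound:
        return self.enable_bounds

    def to_json(self) -> Dict[str, Any]:
        return {
            'alpha': self.alpha,
            'confidence': self.confidence,
            'seasonality': self.seasonality,
            'timeStep': self.time_step,
            'enableBounds': self.enable_bounds.value,
            'segments': [segment.to_json() for segment in self.segments],
        }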
@@ -55,17 +42,17 @@ class AnomalyDetector(ProcessingDetector):
                 from_index = utils.timestamp_to_index(dataframe, pd.to_datetime(segment.from_timestamp, unit='ms'))
                 to_index = utils.timestamp_to_index(dataframe, pd.to_datetime(segment.to_timestamp, unit='ms'))
                 segment_data = dataframe[from_index : to_index]
-                prepared_segments.append({
-                    'from': segment.from_timestamp,
-                    'to': segment.to_timestamp,
-                    'data': segment_data.value.tolist()
-                })
+                prepared_segments.append(
+                    AnomalyDetectorSegment(
+                        segment.from_timestamp,
+                        segment.to_timestamp,
+                        segment_data.value.tolist()
+                    )
+                )

-            new_cache['seasonality'] = seasonality
-            new_cache['segments'] = prepared_segments
+            cache.set_segments(prepared_segments)

         return {
-            'cache': new_cache
+            'cache': cache.to_json()
         }

     # TODO: ModelCache -> DetectorState
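The prepared segments become typed objects rather than dicts. A hypothetical shape for AnomalyDetectorSegment, matching only the positional construction above; the real class in analytic_types/segment.py (which, per the imports, sits alongside Segment) may differ:

class AnomalyDetectorSegment:
    def __init__(self, from_timestamp: int, to_timestamp: int, data: list):
        self.from_timestamp = from_timestamp
        self.to_timestamp = to_timestamp
        self.data = data  # the segment's raw values, used later to build seasonal bounds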
@@ -74,41 +61,37 @@ class AnomalyDetector(ProcessingDetector):
             raise f'Analytic unit {self.analytic_unit_id} got empty cache'
         data = dataframe['value']

-        # TODO: use class for cache to avoid using string literals
-        alpha = self.get_value_from_cache(cache, 'alpha', required = True)
-        confidence = self.get_value_from_cache(cache, 'confidence', required = True)
-        segments = self.get_value_from_cache(cache, 'segments')
-        enable_bounds = Bound(self.get_value_from_cache(cache, 'enableBounds') or 'ALL')
+        cache = AnomalyCache.from_json(cache)
+        segments = cache.segments
+        enabled_bounds = cache.get_enabled_bounds()

-        smoothed_data = utils.exponential_smoothing(data, alpha)
+        smoothed_data = utils.exponential_smoothing(data, cache.alpha)

-        lower_bound = smoothed_data - confidence
-        upper_bound = smoothed_data + confidence
+        lower_bound = smoothed_data - cache.confidence
+        upper_bound = smoothed_data + cache.confidence

-        if segments is not None:
-            parsed_segments = map(AnomalyDetectorSegment.from_json, segments)
-            time_step = self.get_value_from_cache(cache, 'timeStep', required = True)
-            seasonality = self.get_value_from_cache(cache, 'seasonality', required = True)
-            assert seasonality > 0, \
-                f'{self.analytic_unit_id} got invalid seasonality {seasonality}'
-
+        if len(segments) > 0:
             data_start_time = utils.convert_pd_timestamp_to_ms(dataframe['timestamp'][0])

-            for segment in parsed_segments:
-                seasonality_index = seasonality // time_step
-                seasonality_offset = self.get_seasonality_offset(segment.from_timestamp, seasonality, data_start_time, time_step)
+            for segment in segments:
+                seasonality_index = cache.seasonality // cache.time_step
+                seasonality_offset = self.get_seasonality_offset(
+                    segment.from_timestamp,
+                    cache.seasonality,
+                    data_start_time,
+                    cache.time_step
+                )
                 segment_data = pd.Series(segment.data)

                 lower_bound = self.add_season_to_data(lower_bound, segment_data, seasonality_offset, seasonality_index, Bound.LOWER)
                 upper_bound = self.add_season_to_data(upper_bound, segment_data, seasonality_offset, seasonality_index, Bound.UPPER)

-        detected_segments = list(self.detections_generator(dataframe, upper_bound, lower_bound, enable_bounds))
+        detected_segments = list(self.detections_generator(dataframe, upper_bound, lower_bound, enabled_bounds))

         last_dataframe_time = dataframe.iloc[-1]['timestamp']
         last_detection_time = utils.convert_pd_timestamp_to_ms(last_dataframe_time)

-        return DetectionResult(cache, detected_segments, last_detection_time)
+        return DetectionResult(cache.to_json(), detected_segments, last_detection_time)

     def consume_data(self, data: pd.DataFrame, cache: Optional[ModelCache]) -> Optional[DetectionResult]:
         if cache is None:
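For intuition about the band logic in detect(), here is a standalone toy, assuming utils.exponential_smoothing behaves like a standard EWMA (s_t = alpha * x_t + (1 - alpha) * s_{t-1}); the project helper may differ in details:

import pandas as pd

data = pd.Series([1.0, 1.1, 0.9, 5.0, 1.0])
smoothed = data.ewm(alpha=0.5, adjust=False).mean()  # EWMA stand-in for utils.exponential_smoothing
lower_bound = smoothed - 2.0  # confidence = 2.0
upper_bound = smoothed + 2.0
print((data > upper_bound) | (data < lower_bound))  # only index 3 (the 5.0 spike) breaches the band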
@@ -138,14 +121,15 @@ class AnomalyDetector(ProcessingDetector):
         if cache is None:
             raise ValueError('anomaly detector got None cache')
+        cache = AnomalyCache.from_json(cache)

         for level in range(1, MAX_DEPENDENCY_LEVEL):
-            if (1 - cache['alpha']) ** level < MIN_DEPENDENCY_FACTOR:
+            if (1 - cache.alpha) ** level < MIN_DEPENDENCY_FACTOR:
                 break

         seasonality = 0
-        if cache.get('segments') is not None and cache['seasonality'] > 0:
-            seasonality = cache['seasonality'] // cache['timeStep']
+        if len(cache.segments) > 0:
+            seasonality = cache.seasonality // cache.time_step
         return max(level, seasonality)

     def concat_detection_results(self, detections: List[DetectionResult]) -> DetectionResult:
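The window-size logic is unchanged; only the cache access is typed. With the module constants visible above (BASIC_ALPHA = 0.5, MIN_DEPENDENCY_FACTOR = 0.1) the loop breaks at level 4, since 0.5 ** 4 = 0.0625 is the first power under 0.1. A standalone check (MAX_DEPENDENCY_LEVEL is assumed here for illustration; the real constant is defined elsewhere in the module):

MIN_DEPENDENCY_FACTOR = 0.1
MAX_DEPENDENCY_LEVEL = 100  # assumed placeholder
alpha = 0.5  # BASIC_ALPHA

for level in range(1, MAX_DEPENDENCY_LEVEL):
    if (1 - alpha) ** level < MIN_DEPENDENCY_FACTOR:
        break
print(level)  # 4, so the window is max(4, seasonality // time_step)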
@@ -160,31 +144,28 @@ class AnomalyDetector(ProcessingDetector):

     # TODO: remove duplication with detect()
     def process_data(self, dataframe: pd.DataFrame, cache: ModelCache) -> ProcessingResult:
-        segments = self.get_value_from_cache(cache, 'segments')
-        alpha = self.get_value_from_cache(cache, 'alpha', required = True)
-        confidence = self.get_value_from_cache(cache, 'confidence', required = True)
-        enable_bounds = Bound(self.get_value_from_cache(cache, 'enableBounds') or 'ALL')
+        cache = AnomalyCache.from_json(cache)
+        segments = cache.segments
+        enabled_bounds = cache.get_enabled_bounds()

         # TODO: exponential_smoothing should return dataframe with related timestamps
-        smoothed_data = utils.exponential_smoothing(dataframe['value'], alpha)
+        smoothed_data = utils.exponential_smoothing(dataframe['value'], cache.alpha)

-        lower_bound = smoothed_data - confidence
-        upper_bound = smoothed_data + confidence
+        lower_bound = smoothed_data - cache.confidence
+        upper_bound = smoothed_data + cache.confidence

-        if segments is not None:
-            segments: List[AnomalyDetectorSegment] = map(AnomalyDetectorSegment.from_json, segments)
-            seasonality = self.get_value_from_cache(cache, 'seasonality', required = True)
-            assert seasonality > 0, \
-                f'{self.analytic_unit_id} got invalid seasonality {seasonality}'
-
+        if len(segments) > 0:
             data_start_time = utils.convert_pd_timestamp_to_ms(dataframe['timestamp'][0])
-            time_step = self.get_value_from_cache(cache, 'timeStep', required = True)

             for segment in segments:
-                seasonality_index = seasonality // time_step
+                seasonality_index = cache.seasonality // cache.time_step
                 # TODO: move it to utils and add tests
-                seasonality_offset = self.get_seasonality_offset(segment.from_timestamp, seasonality, data_start_time, time_step)
+                seasonality_offset = self.get_seasonality_offset(
+                    segment.from_timestamp,
+                    cache.seasonality,
+                    data_start_time,
+                    cache.time_step
+                )
                 segment_data = pd.Series(segment.data)

                 lower_bound = self.add_season_to_data(lower_bound, segment_data, seasonality_offset, seasonality_index, Bound.LOWER)
@@ -196,11 +177,11 @@ class AnomalyDetector(ProcessingDetector):
         lower_bound_timeseries = list(zip(timestamps, lower_bound.values.tolist()))
         upper_bound_timeseries = list(zip(timestamps, upper_bound.values.tolist()))

-        if enable_bounds == Bound.ALL:
+        if enabled_bounds == Bound.ALL:
             return ProcessingResult(lower_bound_timeseries, upper_bound_timeseries)
-        elif enable_bounds == Bound.UPPER:
+        elif enabled_bounds == Bound.UPPER:
             return ProcessingResult(upper_bound = upper_bound_timeseries)
-        elif enable_bounds == Bound.LOWER:
+        elif enabled_bounds == Bound.LOWER:
             return ProcessingResult(lower_bound = lower_bound_timeseries)

     def add_season_to_data(self, data: pd.Series, segment: pd.Series, offset: int, seasonality: int, bound_type: Bound) -> pd.Series:
@@ -275,14 +256,14 @@ class AnomalyDetector(ProcessingDetector):
         dataframe: pd.DataFrame,
         upper_bound: pd.DataFrame,
         lower_bound: pd.DataFrame,
-        enable_bounds: Bound
+        enabled_bounds: Bound
     ) -> Generator[Segment, None, Segment]:
         in_segment = False
         segment_start = 0
         bound: Bound = None
         for idx, val in enumerate(dataframe['value'].values):
             if val > upper_bound.values[idx]:
-                if enable_bounds == Bound.UPPER or enable_bounds == Bound.ALL:
+                if enabled_bounds == Bound.UPPER or enabled_bounds == Bound.ALL:
                     if not in_segment:
                         in_segment = True
                         segment_start = dataframe['timestamp'][idx]
@@ -290,7 +271,7 @@ class AnomalyDetector(ProcessingDetector):
                     continue

             if val < lower_bound.values[idx]:
-                if enable_bounds == Bound.LOWER or enable_bounds == Bound.ALL:
+                if enabled_bounds == Bound.LOWER or enabled_bounds == Bound.ALL:
                     if not in_segment:
                         in_segment = True
                         segment_start = dataframe['timestamp'][idx]
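The rename to enabled_bounds threads through detections_generator, whose filtering the toy below reproduces in miniature (standalone sketch; the real generator yields Segment objects with timestamps rather than indices):

values =      [1.0, 6.0, 1.0, -4.0, 1.0]
upper_bound = [5.0] * 5
lower_bound = [-3.0] * 5
enabled = 'UPPER'  # mirrors Bound.UPPER

breaches = [
    idx for idx, val in enumerate(values)
    if (enabled in ('UPPER', 'ALL') and val > upper_bound[idx])
    or (enabled in ('LOWER', 'ALL') and val < lower_bound[idx])
]
print(breaches)  # [1]: the dip at index 3 is ignored because LOWER is disabled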