
Anomaly detector data bucket fixes (#27)

Branch: master
rozetko committed 4 years ago (via GitHub)
Commit: eef325307f
4 changed files:
  1. analytics/analytic_types/data_bucket.py (25 changed lines)
  2. analytics/detectors/anomaly_detector.py (15 changed lines)
  3. analytics/detectors/pattern_detector.py (17 changed lines)
  4. tests/test_bucket.py (8 changed lines)

analytics/analytic_types/data_bucket.py (25 changed lines)

@@ -1,14 +1,33 @@
+from typing import Optional
+
 import pandas as pd


 class DataBucket:

-    def __init__(self):
+    def __init__(self, max_size: Optional[int] = None):
+        self.set_max_size(max_size)
         self.data = pd.DataFrame([], columns=['timestamp', 'value'])

-    def receive_data(self, data: pd.DataFrame):
+    def append_data(self, data: pd.DataFrame) -> None:
         self.data = self.data.append(data, ignore_index=True)

-    def drop_data(self, count: int):
+        if self.max_size == None:
+            return
+
+        bucket_size = self.get_current_size()
+        if bucket_size > self.max_size:
+            extra_data_count = bucket_size - self.max_size
+            self.drop_data(extra_data_count)
+
+    def drop_data(self, count: int) -> None:
         if count > 0:
             self.data = self.data.iloc[count:]
+
+    def set_max_size(self, max_size: Optional[int] = None) -> None:
+        if max_size is not None and max_size < 0:
+            raise Exception(f'Can`t set negative max size for bucket: {max_size}')
+        self.max_size = max_size
+
+    def get_current_size(self) -> int:
+        return len(self.data)
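
The new cap turns the bucket into a sliding window: once max_size is set, append_data trims the oldest rows so the bucket never holds more than max_size entries. A minimal usage sketch, assuming the analytic_types package is importable and a pandas version where DataFrame.append still exists (the class relies on it):

    import pandas as pd
    from analytic_types.data_bucket import DataBucket

    bucket = DataBucket(max_size=3)
    # Append 5 rows; the 2 oldest rows are dropped right after the append.
    bucket.append_data(pd.DataFrame({'timestamp': list(range(5)), 'value': list(range(5))}))
    print(bucket.get_current_size())       # 3
    print(bucket.data['value'].tolist())   # [2, 3, 4]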

analytics/detectors/anomaly_detector.py (15 changed lines)

@@ -17,6 +17,9 @@ import utils
 MAX_DEPENDENCY_LEVEL = 100
 MIN_DEPENDENCY_FACTOR = 0.1
 BASIC_ALPHA = 0.5
+BUCKET_SIZE = MAX_DEPENDENCY_LEVEL

 logger = logging.getLogger('ANOMALY_DETECTOR')

@@ -105,9 +108,12 @@ class AnomalyDetector(ProcessingDetector):
         if len(data_without_nan) == 0:
             return None

-        self.bucket.receive_data(data_without_nan)
+        window_size = self.get_window_size(cache)
+        self.bucket.set_max_size(BUCKET_SIZE)
+        self.bucket.append_data(data_without_nan)

-        if len(self.bucket.data) >= self.get_window_size(cache):
+        if self.bucket.get_current_size() >= window_size:
             return self.detect(self.bucket.data, cache)

         return None

@@ -264,7 +270,8 @@ class AnomalyDetector(ProcessingDetector):
                     yield Segment(
                         utils.convert_pd_timestamp_to_ms(segment_start),
                         utils.convert_pd_timestamp_to_ms(segment_end),
-                        message=f'{val} out of {str(bound.value)} bound'
+                        # TODO: configurable decimals number
+                        message=f'{val:.2f} out of {str(bound.value)} bound'
                     )
                     in_segment = False
                 else:

@@ -273,5 +280,5 @@ class AnomalyDetector(ProcessingDetector):
             return Segment(
                 utils.convert_pd_timestamp_to_ms(segment_start),
                 utils.convert_pd_timestamp_to_ms(segment_end),
-                message=f'{val} out of {str(bound.value)} bound'
+                message=f'{val:.2f} out of {str(bound.value)} bound'
             )
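
On the consume_data side, the detector now caps its bucket at BUCKET_SIZE (= MAX_DEPENDENCY_LEVEL, i.e. 100 points) and runs detect only once the bucket holds at least one full window. The message change is purely cosmetic: {val:.2f} renders the offending value with two decimals instead of full float precision, and the TODO notes that the decimal count may become configurable later. A quick illustration ('UPPER' stands in for bound.value here):

    val = 2.718281828
    print(f'{val} out of UPPER bound')      # 2.718281828 out of UPPER bound
    print(f'{val:.2f} out of UPPER bound')  # 2.72 out of UPPER bound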

analytics/detectors/pattern_detector.py (17 changed lines)

@@ -110,24 +110,21 @@ class PatternDetector(Detector):
         if len(data_without_nan) == 0:
             return None

-        self.bucket.receive_data(data_without_nan)
-
         # TODO: use ModelState
         window_size = cache['windowSize']

-        bucket_len = len(self.bucket.data)
-        if bucket_len < window_size * 2:
-            msg = f'{self.analytic_unit_id} bucket data {bucket_len} less than two window size {window_size * 2}, skip run detection from consume_data'
+        bucket_max_size = max(window_size * self.BUCKET_WINDOW_SIZE_FACTOR, self.MIN_BUCKET_SIZE)
+        self.bucket.set_max_size(bucket_max_size)
+        self.bucket.append_data(data_without_nan)
+
+        bucket_size = self.bucket.get_current_size()
+        if bucket_size < window_size * 2:
+            msg = f'{self.analytic_unit_id} bucket data {bucket_size} less than two window size {window_size * 2}, skip run detection from consume_data'
             logger.debug(msg)
             return None

         res = self.detect(self.bucket.data, cache)

-        bucket_size = max(window_size * self.BUCKET_WINDOW_SIZE_FACTOR, self.MIN_BUCKET_SIZE)
-        if bucket_len > bucket_size:
-            excess_data = bucket_len - bucket_size
-            self.bucket.drop_data(excess_data)
-
         logging.debug('End consume_data for analytic unit: {} with res: {}'.format(self.analytic_unit_id, str(res.to_json())))

         if res:
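
With this change the manual trimming that consume_data used to do after detect is gone: the bucket itself enforces max(window_size * self.BUCKET_WINDOW_SIZE_FACTOR, self.MIN_BUCKET_SIZE) on every append. A rough sketch of the sizing rule (the constant values below are illustrative, not the detector's actual class attributes):

    BUCKET_WINDOW_SIZE_FACTOR = 5   # illustrative value
    MIN_BUCKET_SIZE = 150           # illustrative value

    window_size = 20
    print(max(window_size * BUCKET_WINDOW_SIZE_FACTOR, MIN_BUCKET_SIZE))  # 150: the floor wins for small windows

    window_size = 60
    print(max(window_size * BUCKET_WINDOW_SIZE_FACTOR, MIN_BUCKET_SIZE))  # 300: scales up with larger windows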

tests/test_bucket.py (8 changed lines)

@@ -13,17 +13,17 @@ class TestBucket(unittest.TestCase):
         data_val = list(range(6))
         timestamp_list = create_list_of_timestamps(len(data_val))
         for val in data_val:
-            bucket.receive_data(get_pd_dataframe([val], [1523889000000 + val]))
+            bucket.append_data(get_pd_dataframe([val], [1523889000000 + val]))
         for idx, row in bucket.data.iterrows():
             self.assertEqual(data_val[idx], row['value'])
             self.assertEqual(timestamp_list[idx], row['timestamp'])

-    def test_drop_data(self):
+    def test_max_size(self):
         bucket = DataBucket()
         data_val = list(range(10))
         timestamp_list = create_list_of_timestamps(len(data_val))
-        bucket.receive_data(get_pd_dataframe(data_val, timestamp_list))
-        bucket.drop_data(5)
+        bucket.set_max_size(5)
+        bucket.append_data(get_pd_dataframe(data_val, timestamp_list))
         expected_data = data_val[5:]
         expected_timestamp = timestamp_list[5:]
         self.assertEqual(expected_data, bucket.data['value'].tolist())
