Browse Source

Send data to detection in chunks #489 (#496)

pull/1/head
Evgeny Smyshlyaev 6 years ago committed by rozetko
parent
commit
d1f8b80c9e
  1. 3
      analytics/analytics/analytic_unit_worker.py
  2. 2
      analytics/analytics/detectors/detector.py
  3. 60
      analytics/analytics/detectors/pattern_detector.py
  4. 2
      analytics/analytics/detectors/threshold_detector.py
  5. 2
      analytics/bin/server
  6. 22
      analytics/tests/test_detector_chunks.py

3
analytics/analytics/analytic_unit_worker.py

@@ -35,7 +35,8 @@ class AnalyticUnitWorker:
raise Exception('Timeout ({}s) exceeded while learning'.format(config.LEARNING_TIMEOUT)) raise Exception('Timeout ({}s) exceeded while learning'.format(config.LEARNING_TIMEOUT))
async def do_detect(self, data: pd.DataFrame, cache: Optional[ModelCache]) -> dict: async def do_detect(self, data: pd.DataFrame, cache: Optional[ModelCache]) -> dict:
return self._detector.detect(data, cache) # TODO: return without await
return await self._detector.detect(data, cache)
def cancel(self): def cancel(self):
if self._training_future is not None: if self._training_future is not None:

2
analytics/analytics/detectors/detector.py

@@ -14,7 +14,7 @@ class Detector(ABC):
pass pass
@abstractmethod @abstractmethod
def detect(self, dataframe: DataFrame, cache: Optional[ModelCache]) -> dict: async def detect(self, dataframe: DataFrame, cache: Optional[ModelCache]) -> dict:
pass pass
@abstractmethod @abstractmethod

60
analytics/analytics/detectors/pattern_detector.py

@@ -1,10 +1,11 @@
import models import models
import asyncio
import logging import logging
import config import config
import pandas as pd import pandas as pd
from typing import Optional from typing import Optional, Generator
from detectors import Detector from detectors import Detector
from buckets import DataBucket from buckets import DataBucket
@@ -33,11 +34,12 @@ def resolve_model_by_pattern(pattern: str) -> models.Model:
AnalyticUnitId = str AnalyticUnitId = str
class PatternDetector(Detector): class PatternDetector(Detector):
MIN_BUCKET_SIZE = 150
def __init__(self, pattern_type: str, analytic_unit_id: AnalyticUnitId): def __init__(self, pattern_type: str, analytic_unit_id: AnalyticUnitId):
self.analytic_unit_id = analytic_unit_id self.analytic_unit_id = analytic_unit_id
self.pattern_type = pattern_type self.pattern_type = pattern_type
self.model = resolve_model_by_pattern(self.pattern_type) self.model = resolve_model_by_pattern(self.pattern_type)
self.min_bucket_size = 150
self.bucket = DataBucket() self.bucket = DataBucket()
def train(self, dataframe: pd.DataFrame, segments: list, cache: Optional[models.ModelCache]) -> models.ModelCache: def train(self, dataframe: pd.DataFrame, segments: list, cache: Optional[models.ModelCache]) -> models.ModelCache:
@@ -49,12 +51,32 @@ class PatternDetector(Detector):
'cache': new_cache 'cache': new_cache
} }
def detect(self, dataframe: pd.DataFrame, cache: Optional[models.ModelCache]) -> dict: async def detect(self, dataframe: pd.DataFrame, cache: Optional[models.ModelCache]) -> dict:
logger.debug('Unit {} got {} data points for detection'.format(self.analytic_unit_id, len(dataframe))) logger.debug('Unit {} got {} data points for detection'.format(self.analytic_unit_id, len(dataframe)))
# TODO: split and sleep (https://github.com/hastic/hastic-server/pull/124#discussion_r214085643) # TODO: split and sleep (https://github.com/hastic/hastic-server/pull/124#discussion_r214085643)
if not cache:
msg = f'{self.analytic_unit_id} detection got invalid cache {cache}, skip detection'
logger.error(msg)
raise ValueError(msg)
window_size = cache.get('WINDOW_SIZE')
if not window_size:
msg = f'{self.analytic_unit_id} detection got invalid window size {window_size}'
chunks = self.__get_data_chunks(dataframe, window_size)
segments = []
segment_parser = lambda segment: { 'from': segment[0], 'to': segment[1] }
for chunk in chunks:
await asyncio.sleep(0)
detected = self.model.detect(dataframe, self.analytic_unit_id, cache) detected = self.model.detect(dataframe, self.analytic_unit_id, cache)
for detected_segment in detected['segments']:
detected_segment = segment_parser(detected_segment)
if detected_segment not in segments:
segments.append(detected_segment)
segments = [{ 'from': segment[0], 'to': segment[1] } for segment in detected['segments']]
newCache = detected['cache'] newCache = detected['cache']
last_dataframe_time = dataframe.iloc[-1]['timestamp'] last_dataframe_time = dataframe.iloc[-1]['timestamp']
@@ -76,7 +98,7 @@ class PatternDetector(Detector):
if cache == None: if cache == None:
logging.debug('Recieve_data cache is None for task {}'.format(self.analytic_unit_id)) logging.debug('Recieve_data cache is None for task {}'.format(self.analytic_unit_id))
cache = {} cache = {}
bucket_size = max(cache.get('WINDOW_SIZE', 0) * 3, self.min_bucket_size) bucket_size = max(cache.get('WINDOW_SIZE', 0) * 3, self.MIN_BUCKET_SIZE)
res = self.detect(self.bucket.data, cache) res = self.detect(self.bucket.data, cache)
@@ -88,3 +110,31 @@ class PatternDetector(Detector):
return res return res
else: else:
return None return None
def __get_data_chunks(self, dataframe: pd.DataFrame, window_size: int) -> Generator[pd.DataFrame, None, None]:
    """
    Yield overlapping chunks of ``dataframe`` for incremental detection.

    Each chunk is ``window_size * 100`` points long and consecutive chunks
    overlap by ``window_size`` points, so a pattern that falls on a chunk
    boundary is still fully contained in at least one chunk.
    Example with chunk size 3 and overlap 1: received dataframe
    [0, 1, 2, 3, 4, 5] yields chunks [0, 1, 2], [2, 3, 4], [4, 5].

    A dataframe shorter than one chunk (including an empty one) is
    yielded back as a single chunk.
    """
    chunk_size = window_size * 100
    intersection = window_size
    data_len = len(dataframe)

    if data_len < chunk_size:
        return (chunk for chunk in (dataframe,))

    def slices() -> Generator[slice, None, None]:
        # Step between chunk starts; each chunk then extends `intersection`
        # points past the next chunk's start to create the overlap.
        step = chunk_size - intersection
        mod = data_len % step
        chunks_number = data_len // step

        offset = 0
        for _ in range(chunks_number):
            # Was `offset + step + 1`, which hard-coded a 1-point overlap
            # and was only correct for window_size == 1; use the intended
            # `intersection`-sized overlap (identical when window_size == 1).
            yield slice(offset, offset + step + intersection)
            offset += step

        # Yield the tail only when leftover data exists; otherwise an
        # empty chunk would be handed to detection.
        if mod > 0:
            yield slice(offset, offset + mod)

    return (dataframe[chunk_slice] for chunk_slice in slices())

2
analytics/analytics/detectors/threshold_detector.py

@@ -25,7 +25,7 @@ class ThresholdDetector(Detector):
} }
} }
def detect(self, dataframe: pd.DataFrame, cache: ModelCache) -> dict: async def detect(self, dataframe: pd.DataFrame, cache: ModelCache) -> dict:
if cache == None: if cache == None:
raise 'Threshold detector error: cannot detect before learning' raise 'Threshold detector error: cannot detect before learning'
value = cache['value'] value = cache['value']

2
analytics/bin/server

@@ -100,7 +100,7 @@ async def app_loop():
if __name__ == "__main__": if __name__ == "__main__":
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
loop.set_debug(True) #loop.set_debug(True)
logger.info("Ok") logger.info("Ok")
server_service, data_service, analytic_unit_manager = init_services() server_service, data_service, analytic_unit_manager = init_services()
print('Analytics process is running') # we need to print to stdout and flush print('Analytics process is running') # we need to print to stdout and flush

22
analytics/tests/test_detector_chunks.py

@@ -0,0 +1,22 @@
import unittest
from detectors.pattern_detector import PatternDetector
def rlist(start, stop):
    """Return the inclusive integer range [start, stop] as a list.

    Empty when stop < start. Using list(range(...)) directly instead of
    an unnecessary identity comprehension.
    """
    return list(range(start, stop + 1))
class TestUtils(unittest.TestCase):
    """Unit tests for PatternDetector's private data-chunking helper."""

    def test_chunks_generator(self):
        window_size = 1
        # Each case pairs input data with the chunks expected back.
        cases = [
            # An empty input is returned as a single empty chunk.
            ([], [[]]),
            # 301 points split into 100-point chunks overlapping by one.
            (
                rlist(0, 300),
                [rlist(0, 99), rlist(99, 198), rlist(198, 297), rlist(297, 300)],
            ),
        ]

        # Access the name-mangled private method directly for testing.
        get_chunks = PatternDetector._PatternDetector__get_data_chunks
        for data, expected in cases:
            produced = tuple(get_chunks(None, data, window_size))
            self.assertSequenceEqual(produced, expected)
# Allow running this test module directly: `python test_detector_chunks.py`.
if __name__ == '__main__':
    unittest.main()
Loading…
Cancel
Save