import asyncio
import logging
from typing import Optional, Generator

import pandas as pd

import config
import models
from detectors import Detector
from buckets import DataBucket
from models import ModelCache
from utils import convert_pd_timestamp_to_ms


logger = logging.getLogger('PATTERN_DETECTOR')


def resolve_model_by_pattern(pattern: str) -> models.Model:
    if pattern == 'GENERAL':
        return models.GeneralModel()
    if pattern == 'PEAK':
        return models.PeakModel()
    if pattern == 'TROUGH':
        return models.TroughModel()
    if pattern == 'DROP':
        return models.DropModel()
    if pattern == 'JUMP':
        return models.JumpModel()
    if pattern == 'CUSTOM':
        return models.CustomModel()
    raise ValueError('Unknown pattern "%s"' % pattern)
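
# Usage sketch (hypothetical, not part of the original module): the factory maps
# a pattern name to a fresh model instance, e.g.
#   model = resolve_model_by_pattern('PEAK')  # -> models.PeakModel()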


AnalyticUnitId = str


class PatternDetector(Detector):

    MIN_BUCKET_SIZE = 150

    def __init__(self, pattern_type: str, analytic_unit_id: AnalyticUnitId):
        self.analytic_unit_id = analytic_unit_id
        self.pattern_type = pattern_type
        self.model = resolve_model_by_pattern(self.pattern_type)
        self.bucket = DataBucket()

    def train(self, dataframe: pd.DataFrame, segments: list, cache: Optional[models.ModelCache]) -> models.ModelCache:
        # TODO: pass only the part of the dataframe that has segments
        new_cache = self.model.fit(dataframe, segments, self.analytic_unit_id, cache)
        if new_cache is None or len(new_cache) == 0:
            logger.warning('new_cache is empty with data: {}, segments: {}, cache: {}, analytic unit: {}'.format(
                dataframe, segments, cache, self.analytic_unit_id
            ))
        return {
            'cache': new_cache
        }

    async def detect(self, dataframe: pd.DataFrame, cache: Optional[models.ModelCache]) -> dict:
        logger.debug('Unit {} got {} data points for detection'.format(self.analytic_unit_id, len(dataframe)))
        # TODO: split and sleep (https://github.com/hastic/hastic-server/pull/124#discussion_r214085643)

        if not cache:
            msg = f'{self.analytic_unit_id} detection got invalid cache {cache}, skip detection'
            logger.error(msg)
            raise ValueError(msg)

        window_size = cache.get('WINDOW_SIZE')
        if not window_size:
            msg = f'{self.analytic_unit_id} detection got invalid window size {window_size}'
            logger.error(msg)
            raise ValueError(msg)

        chunks = self.__get_data_chunks(dataframe, window_size)

        segments = []
        segment_parser = lambda segment: {'from': segment[0], 'to': segment[1]}
        new_cache = cache
        for chunk in chunks:
            # yield control to the event loop between chunks so long detections
            # don't block other tasks
            await asyncio.sleep(0)
            detected = self.model.detect(chunk, self.analytic_unit_id, cache)
            for detected_segment in detected['segments']:
                detected_segment = segment_parser(detected_segment)
                if detected_segment not in segments:
                    segments.append(detected_segment)
            new_cache = detected['cache']

        last_dataframe_time = dataframe.iloc[-1]['timestamp']
        last_detection_time = convert_pd_timestamp_to_ms(last_dataframe_time)
        return {
            'cache': new_cache,
            'segments': segments,
            'lastDetectionTime': last_detection_time
        }
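
    # A minimal driver sketch (hypothetical, assuming Python 3.7+): detect() is
    # a coroutine, so a caller runs it on an event loop, e.g.
    #   result = asyncio.run(PatternDetector('PEAK', 'unit-1').detect(dataframe, cache))
    # where `cache` is expected to contain the 'WINDOW_SIZE' key produced by train().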

    async def recieve_data(self, data: pd.DataFrame, cache: Optional[ModelCache]) -> Optional[dict]:
        logger.debug('Start recieve_data for analytic unit {}'.format(self.analytic_unit_id))
        data_without_nan = data.dropna()

        if len(data_without_nan) == 0:
            return None

        self.bucket.receive_data(data_without_nan)

        if cache is None:
            logger.debug('Recieve_data cache is None for task {}'.format(self.analytic_unit_id))
            cache = {}
        # keep enough points in the bucket for detection: at least three
        # detection windows, but never less than MIN_BUCKET_SIZE
        bucket_size = max(cache.get('WINDOW_SIZE', 0) * 3, self.MIN_BUCKET_SIZE)

        res = await self.detect(self.bucket.data, cache)

        if len(self.bucket.data) > bucket_size:
            excess_data = len(self.bucket.data) - bucket_size
            self.bucket.drop_data(excess_data)

        logger.debug('End recieve_data for analytic unit: {} with res: {}'.format(self.analytic_unit_id, res))
        return res if res else None

    def __get_data_chunks(self, dataframe: pd.DataFrame, window_size: int) -> Generator[pd.DataFrame, None, None]:
        """
        Returns a generator that yields dataframe chunks.
        Chunks are `chunk_size` points long; consecutive chunks overlap by
        `intersection` points.
        Example: with chunk_size = 3 and intersection = 1, the dataframe
        [0, 1, 2, 3, 4, 5] is split into chunks [0, 1, 2], [2, 3, 4], [4, 5].
        """
        chunk_size = window_size * 100
        intersection = window_size

        data_len = len(dataframe)

        if data_len < chunk_size:
            return (chunk for chunk in (dataframe,))

        def slices():
            nonintersected = chunk_size - intersection
            mod = data_len % nonintersected
            chunks_number = data_len // nonintersected

            offset = 0
            for i in range(chunks_number):
                # each chunk spans chunk_size points and overlaps the next
                # one by `intersection` points
                yield slice(offset, offset + chunk_size)
                offset += nonintersected

            # only yield the tail if anything is left, otherwise the final
            # slice would produce an empty chunk
            if mod > 0:
                yield slice(offset, offset + mod)

        return (dataframe[chunk_slice] for chunk_slice in slices())
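
# A worked example of the chunking arithmetic above, with hypothetical small
# numbers (chunk_size = 3, intersection = 1) in place of window_size * 100:
#   data_len = 6 -> nonintersected = 2, chunks_number = 3, mod = 0
#   slices: [0:3] -> [0, 1, 2], [2:5] -> [2, 3, 4], [4:7] -> [4, 5]
# Every point is covered and adjacent chunks share `intersection` points.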