analytics/analytics/analytic_unit_worker.py

import config
import detectors
import logging
import pandas as pd
from typing import Optional, Union, Generator, List, Tuple
import concurrent.futures
import asyncio
import utils
from utils import get_intersected_chunks, get_chunks, prepare_data

from analytic_types import ModelCache, TimeSeries
from analytic_types.detector import DetectionResult

logger = logging.getLogger('AnalyticUnitWorker')


class AnalyticUnitWorker:

    CHUNK_WINDOW_SIZE_FACTOR = 100
    CHUNK_INTERSECTION_FACTOR = 2

    assert CHUNK_WINDOW_SIZE_FACTOR > CHUNK_INTERSECTION_FACTOR, \
        'CHUNK_INTERSECTION_FACTOR should be less than CHUNK_WINDOW_SIZE_FACTOR'

    def __init__(self, analytic_unit_id: str, detector: detectors.Detector, executor: concurrent.futures.Executor):
        self.analytic_unit_id = analytic_unit_id
        self._detector = detector
        self._executor: concurrent.futures.Executor = executor
        self._training_future: asyncio.Future = None

    async def do_train(
        self, payload: Union[list, dict], data: TimeSeries, cache: Optional[ModelCache]
    ) -> Optional[ModelCache]:

        dataframe = prepare_data(data)

        cfuture: concurrent.futures.Future = self._executor.submit(
            self._detector.train, dataframe, payload, cache
        )
        self._training_future = asyncio.wrap_future(cfuture)
        try:
            new_cache: ModelCache = await asyncio.wait_for(self._training_future, timeout = config.LEARNING_TIMEOUT)
            return new_cache
        except asyncio.CancelledError:
            return None
        except asyncio.TimeoutError:
            raise Exception('Timeout ({}s) exceeded while learning'.format(config.LEARNING_TIMEOUT))

    async def do_detect(self, data: pd.DataFrame, cache: Optional[ModelCache]) -> DetectionResult:

        window_size = self._detector.get_window_size(cache)
        chunk_size = window_size * self.CHUNK_WINDOW_SIZE_FACTOR
        chunk_intersection = window_size * self.CHUNK_INTERSECTION_FACTOR

        detections: List[DetectionResult] = []
        chunks = []
        # XXX: get_chunks(data, chunk_size) == get_intersected_chunks(data, 0, chunk_size)
        if self._detector.is_detection_intersected():
            chunks = get_intersected_chunks(data, chunk_intersection, chunk_size)
        else:
            chunks = get_chunks(data, chunk_size)

        for chunk in chunks:
            await asyncio.sleep(0)
            chunk_dataframe = prepare_data(chunk)
            detected: DetectionResult = self._detector.detect(chunk_dataframe, cache)
            detections.append(detected)

        if len(detections) == 0:
            raise RuntimeError(f'do_detect for {self.analytic_unit_id} got empty detection results')

        detection_result = self._detector.concat_detection_results(detections)
        return detection_result.to_json()

    def cancel(self):
        if self._training_future is not None:
            self._training_future.cancel()

    async def consume_data(self, data: TimeSeries, cache: Optional[ModelCache]) -> Optional[dict]:
        window_size = self._detector.get_window_size(cache)

        detections: List[DetectionResult] = []

        for chunk in get_chunks(data, window_size * self.CHUNK_WINDOW_SIZE_FACTOR):
            await asyncio.sleep(0)
            chunk_dataframe = prepare_data(chunk)
            detected = self._detector.consume_data(chunk_dataframe, cache)
            if detected is not None:
                detections.append(detected)

        if len(detections) == 0:
            return None
        else:
            detection_result = self._detector.concat_detection_results(detections)
            return detection_result.to_json()

    async def process_data(self, data: TimeSeries, cache: ModelCache) -> dict:
        assert isinstance(self._detector, detectors.ProcessingDetector), \
            f'{self.analytic_unit_id} detector is not ProcessingDetector, can`t process data'
        assert cache is not None, f'{self.analytic_unit_id} got empty cache for processing data'

        processed_chunks = []
        window_size = self._detector.get_window_size(cache)
        for chunk in get_chunks(data, window_size * self.CHUNK_WINDOW_SIZE_FACTOR):
            await asyncio.sleep(0)
            chunk_dataframe = prepare_data(chunk)
            processed = self._detector.process_data(chunk_dataframe, cache)
            if processed is not None:
                processed_chunks.append(processed)

        if len(processed_chunks) == 0:
            raise RuntimeError(f'process_data for {self.analytic_unit_id} got empty processing results')

        # TODO: maybe we should process all chunks inside of detector?
        result = self._detector.concat_processing_results(processed_chunks)
        return result.to_json()