import config
import detectors
import logging
import pandas as pd
from typing import Optional, Union, Generator, List, Tuple
import concurrent.futures
import asyncio
import utils
from utils import get_intersected_chunks, get_chunks, prepare_data

from analytic_types import ModelCache, TimeSeries
from analytic_types.detector import DetectionResult

logger = logging.getLogger('AnalyticUnitWorker')


class AnalyticUnitWorker:

    CHUNK_WINDOW_SIZE_FACTOR = 100
    CHUNK_INTERSECTION_FACTOR = 2

    assert CHUNK_WINDOW_SIZE_FACTOR > CHUNK_INTERSECTION_FACTOR, \
        'CHUNK_INTERSECTION_FACTOR should be less than CHUNK_WINDOW_SIZE_FACTOR'

    def __init__(self, analytic_unit_id: str, detector: detectors.Detector, executor: concurrent.futures.Executor):
        self.analytic_unit_id = analytic_unit_id
        self._detector = detector
        self._executor: concurrent.futures.Executor = executor
        self._training_future: Optional[asyncio.Future] = None

    async def do_train(
        self, payload: Union[list, dict], data: TimeSeries, cache: Optional[ModelCache]
    ) -> Optional[ModelCache]:
        dataframe = prepare_data(data)
        # Run blocking training in the executor so the event loop stays responsive
        cfuture: concurrent.futures.Future = self._executor.submit(
            self._detector.train, dataframe, payload, cache
        )
        self._training_future = asyncio.wrap_future(cfuture)
        try:
            new_cache: ModelCache = await asyncio.wait_for(self._training_future, timeout = config.LEARNING_TIMEOUT)
            return new_cache
        except asyncio.CancelledError:
            return None
        except asyncio.TimeoutError:
            raise Exception('Timeout ({}s) exceeded while learning'.format(config.LEARNING_TIMEOUT))

    async def do_detect(self, data: TimeSeries, cache: Optional[ModelCache]) -> dict:
        window_size = self._detector.get_window_size(cache)
        chunk_size = window_size * self.CHUNK_WINDOW_SIZE_FACTOR
        chunk_intersection = window_size * self.CHUNK_INTERSECTION_FACTOR

        detections: List[DetectionResult] = []
        chunks = []
        # XXX: get_chunks(data, chunk_size) == get_intersected_chunks(data, 0, chunk_size)
        if self._detector.is_detection_intersected():
            chunks = get_intersected_chunks(data, chunk_intersection, chunk_size)
        else:
            chunks = get_chunks(data, chunk_size)

        for chunk in chunks:
            # Yield control to the event loop between chunks
            await asyncio.sleep(0)
            chunk_dataframe = prepare_data(chunk)
            detected: DetectionResult = self._detector.detect(chunk_dataframe, cache)
            detections.append(detected)

        if len(detections) == 0:
            raise RuntimeError(f'do_detect for {self.analytic_unit_id} got empty detection results')

        detection_result = self._detector.concat_detection_results(detections)
        return detection_result.to_json()

    def cancel(self):
        if self._training_future is not None:
            self._training_future.cancel()

    async def consume_data(self, data: TimeSeries, cache: Optional[ModelCache]) -> Optional[dict]:
        window_size = self._detector.get_window_size(cache)

        detections: List[DetectionResult] = []

        for chunk in get_chunks(data, window_size * self.CHUNK_WINDOW_SIZE_FACTOR):
            await asyncio.sleep(0)
            chunk_dataframe = prepare_data(chunk)
            detected = self._detector.consume_data(chunk_dataframe, cache)
            if detected is not None:
                detections.append(detected)

        if len(detections) == 0:
            return None
        else:
            detection_result = self._detector.concat_detection_results(detections)
            return detection_result.to_json()

    async def process_data(self, data: TimeSeries, cache: ModelCache) -> dict:
        assert isinstance(self._detector, detectors.ProcessingDetector), \
            f'{self.analytic_unit_id} detector is not ProcessingDetector, cannot process data'
        assert cache is not None, f'{self.analytic_unit_id} got empty cache for processing data'

        processed_chunks = []
        window_size = self._detector.get_window_size(cache)
        for chunk in get_chunks(data, window_size * self.CHUNK_WINDOW_SIZE_FACTOR):
            await asyncio.sleep(0)
            chunk_dataframe = prepare_data(chunk)
            processed = self._detector.process_data(chunk_dataframe, cache)
            if processed is not None:
                processed_chunks.append(processed)

        if len(processed_chunks) == 0:
            raise RuntimeError(f'process_data for {self.analytic_unit_id} got empty processing results')

        # TODO: maybe we should process all chunks inside of detector?
        result = self._detector.concat_processing_results(processed_chunks)
        return result.to_json()
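

# --- Usage sketch (illustrative only, not part of the worker) ----------------
# A minimal sketch of how a caller might drive this worker, assuming a concrete
# `detector` implementing the detectors.Detector interface, a `payload` dict,
# and `data` shaped as analytic_types.TimeSeries; all three are placeholders
# supplied by the surrounding analytics service, not defined here:
#
#     executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
#     worker = AnalyticUnitWorker('my-analytic-unit', detector, executor)
#
#     async def run(payload, data):
#         cache = await worker.do_train(payload, data, cache=None)
#         return await worker.do_detect(data, cache)
#
#     detection = asyncio.run(run(payload, data))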