
add src

master
CorpGlory Inc. 4 years ago
parent commit 8734258c84
  1. .dockerignore (+2)
  2. .gitignore (+5)
  3. .vscode/.env (+1)
  4. .vscode/launch.json (+32)
  5. .vscode/settings.json (+22)
  6. Codestyle.md (+27)
  7. Dockerfile (+12)
  8. README.md (+13)
  9. analytics/analytic_types/__init__.py (+39)
  10. analytics/analytic_types/cache.py (+38)
  11. analytics/analytic_types/data_bucket.py (+14)
  12. analytics/analytic_types/detector.py (+47)
  13. analytics/analytic_types/learning_info.py (+17)
  14. analytics/analytic_types/segment.py (+57)
  15. analytics/analytic_unit_manager.py (+103)
  16. analytics/analytic_unit_worker.py (+116)
  17. analytics/config.py (+30)
  18. analytics/detectors/__init__.py (+4)
  19. analytics/detectors/anomaly_detector.py (+277)
  20. analytics/detectors/detector.py (+80)
  21. analytics/detectors/pattern_detector.py (+147)
  22. analytics/detectors/threshold_detector.py (+111)
  23. analytics/models/__init__.py (+9)
  24. analytics/models/custom_model.py (+30)
  25. analytics/models/drop_model.py (+9)
  26. analytics/models/general_model.py (+104)
  27. analytics/models/jump_model.py (+9)
  28. analytics/models/model.py (+230)
  29. analytics/models/peak_model.py (+44)
  30. analytics/models/stair_model.py (+147)
  31. analytics/models/triangle_model.py (+119)
  32. analytics/models/trough_model.py (+44)
  33. analytics/server.py (+94)
  34. analytics/services/__init__.py (+2)
  35. analytics/services/data_service.py (+85)
  36. analytics/services/server_service.py (+149)
  37. analytics/utils/__init__.py (+4)
  38. analytics/utils/common.py (+443)
  39. analytics/utils/concurrent.py (+130)
  40. analytics/utils/dataframe.py (+63)
  41. analytics/utils/meta.py (+81)
  42. analytics/utils/time.py (+13)
  43. bin/server (+32)
  44. pyinstaller_hooks/hook-pandas.py (+1)
  45. pyinstaller_hooks/hook-scipy.py (+1)
  46. requirements.txt (+7)
  47. scripts/build-dist.sh (+3)
  48. tests/__init__.py (+4)
  49. tests/test_analytic_types.py (+16)
  50. tests/test_bucket.py (+38)
  51. tests/test_dataset.py (+386)
  52. tests/test_detectors.py (+265)
  53. tests/test_manager.py (+100)
  54. tests/test_models.py (+43)
  55. tests/test_utils.py (+359)
  56. tests/test_utils_dataframe.py (+43)
  57. tools/analytic_model_tester.py (+122)
  58. tools/send_zmq_message.py (+104)

2
.dockerignore

@@ -0,0 +1,2 @@
__pycache__
.vscode

5
.gitignore

@@ -0,0 +1,5 @@
build/
dist/
*.spec
__pycache__/
test/

1
.vscode/.env

@@ -0,0 +1 @@
PYTHONPATH=analytics

32
.vscode/launch.json

@@ -0,0 +1,32 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Attach (Remote Debug)",
"type": "python",
"request": "attach",
"port": 5679,
"host": "localhost",
"pathMappings": [
{
"localRoot": "${workspaceFolder}",
"remoteRoot": "/var/www/analytics"
}
]
},
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"windows": {
"program": "${workspaceFolder}\\bin\\server"
},
"linux": {
"program": "${workspaceFolder}/bin/server"
}
}
]
}

22
.vscode/settings.json

@@ -0,0 +1,22 @@
{
"terminal.integrated.shell.windows": "C:\\WINDOWS\\System32\\WindowsPowerShell\\v1.0\\powershell.exe",
"editor.insertSpaces": true,
"files.eol": "\n",
"files.exclude": {
"**/__pycache__/": true,
"dist": true,
"build": true
},
"[python]": {
"editor.tabSize": 4,
},
"python.envFile": "${workspaceFolder}/.vscode/.env",
"python.pythonPath": "python",
"python.linting.enabled": true,
"python.testing.unittestArgs": [ "-v" ],
"python.testing.pytestEnabled": false,
"python.testing.nosetestsEnabled": false,
"python.testing.unittestEnabled": true,
"python.linting.pylintEnabled": true,
"python.jediEnabled": false
}

27
Codestyle.md

@@ -0,0 +1,27 @@
# Type hints
Please use: https://www.python.org/dev/peps/pep-0484/
# Line endings
We use LF everywhere
# Imports
Import local files first, then specific libraries, and then standard libraries.
So you go from very specific imports to very common ones.
This lets the reader see the most important dependencies first.
```
from data_provider import DataProvider
from anomaly_model import AnomalyModel
from pattern_detection_model import PatternDetectionModel
import numpy as np
from scipy.signal import argrelextrema
import pickle
```
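A minimal illustration of the PEP 484 style referenced in the Type hints section above (the function itself is hypothetical, not part of this commit):
```
from typing import List, Optional

def average(values: List[float], limit: Optional[int] = None) -> float:
    # annotate parameters and the return value; Optional marks values that may be None
    window = values if limit is None else values[:limit]
    return sum(window) / len(window)
```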

12
Dockerfile

@@ -0,0 +1,12 @@
FROM python:3.6.6
COPY requirements.txt /requirements.txt
RUN pip install -r /requirements.txt
WORKDIR /var/www/analytics
COPY . /var/www/analytics/
CMD ["python", "-u", "bin/server"]

13
README.md

@@ -1 +1,12 @@
# analytics
# Hastic-server-analytics
Python service that receives tasks from [hastic-server-node](https://github.com/hastic/hastic-server/tree/master/server), such as:
* training statistical models
* detecting patterns in time series data
## Architecture
The service uses [asyncio](https://docs.python.org/3/library/asyncio.html),
[concurrent.futures](https://docs.python.org/3.6/library/concurrent.futures.html#module-concurrent.futures) and
[pyzmq](https://pyzmq.readthedocs.io/en/latest/).
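Roughly how these three pieces fit together, as a sketch only; the socket address, message shape and `handle_task` below are illustrative assumptions, not the code added in this commit:
```
import asyncio
import json
from concurrent.futures import ThreadPoolExecutor

import zmq
import zmq.asyncio


def handle_task(task: dict) -> dict:
    # placeholder for CPU-bound work (training / detection) run in the thread pool
    return {'status': 'SUCCESS', 'payload': task.get('payload')}


async def main():
    executor = ThreadPoolExecutor()
    ctx = zmq.asyncio.Context()
    socket = ctx.socket(zmq.PAIR)
    socket.connect('tcp://localhost:8002')  # assumed address
    loop = asyncio.get_event_loop()
    while True:
        message = await socket.recv()                                     # cooperative receive
        task = json.loads(message.decode('utf-8'))
        result = await loop.run_in_executor(executor, handle_task, task)  # offload heavy work
        await socket.send(json.dumps(result).encode('utf-8'))


if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(main())
```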

39
analytics/analytic_types/__init__.py

@@ -0,0 +1,39 @@
"""
It is the place where we put all classes and types
common to all analytics code.
For example, if you write something which is used
in analytic_unit_manager, it should be here.
If you create something specific which is used only in one place,
like PatternDetectionCache, then it should not be here.
"""
import pandas as pd
from typing import Union, List, Tuple
AnalyticUnitId = str
ModelCache = dict
# TODO: explicit timestamp / value
TimeSeries = List[Tuple[int, float]]
"""
Example:
tsis = TimeSeriesIndex(['2017-12-31 16:00:00-08:00', '2017-12-31 17:00:00-08:00', '2017-12-31 18:00:00-08:00'])
ts = TimeSeries([4, 5, 6], tsis)
"""
Timestamp = Union[str, pd.Timestamp]
class TimeSeriesIndex(pd.DatetimeIndex):
def __new__(cls, *args, **kwargs):
return pd.DatetimeIndex.__new__(cls, *args, **kwargs)
# TODO: make generic type for values. See List definition for example of generic class
# TODO: constructor from DataFrame
# TODO: replace TimeSeries (above) with this class: rename TimeSeries2 to TimeSeries
class TimeSeries2(pd.Series):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

38
analytics/analytic_types/cache.py

@@ -0,0 +1,38 @@
from typing import Optional, List, Dict
from analytic_types.segment import AnomalyDetectorSegment
from analytic_types.detector import Bound
from utils.meta import JSONClass, SerializableList
@JSONClass
class AnomalyCache:
def __init__(
self,
alpha: float,
confidence: float,
enable_bounds: str,
seasonality: Optional[int] = None,
segments: Optional[List[Dict]] = None,
time_step: Optional[int] = None,
):
self.alpha = alpha
self.confidence = confidence
self.enable_bounds = enable_bounds
if seasonality is not None and seasonality < 0:
raise ValueError(f'Can`t create AnomalyCache: got invalid seasonality {seasonality}')
self.seasonality = seasonality
self.time_step = time_step
if segments is not None:
anomaly_segments = map(AnomalyDetectorSegment.from_json, segments)
self.segments = SerializableList(anomaly_segments)
else:
self.segments = []
def set_segments(self, segments: List[AnomalyDetectorSegment]):
if len(segments) > 0:
self.segments = SerializableList(segments)
def get_enabled_bounds(self) -> Bound:
#TODO: use class with to_json()
return Bound(self.enable_bounds)
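A hedged usage sketch for the cache above (the field values are illustrative; `to_json` comes from the `JSONClass` decorator in `utils/meta.py`, which is not part of this hunk):
```
from analytic_types.cache import AnomalyCache
from analytic_types.detector import Bound

cache = AnomalyCache(alpha=0.1, confidence=2.0, enable_bounds='ALL')
assert cache.get_enabled_bounds() == Bound.ALL   # 'ALL' maps onto the Bound enum
state = cache.to_json()                          # dict representation generated by @JSONClass
```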

14
analytics/analytic_types/data_bucket.py

@@ -0,0 +1,14 @@
import pandas as pd
class DataBucket:
def __init__(self):
self.data = pd.DataFrame([], columns=['timestamp', 'value'])
def receive_data(self, data: pd.DataFrame):
self.data = self.data.append(data, ignore_index=True)
def drop_data(self, count: int):
if count > 0:
self.data = self.data.iloc[count:]
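A short usage sketch for the bucket above (the sample rows are illustrative):
```
import pandas as pd
from analytic_types.data_bucket import DataBucket

bucket = DataBucket()
bucket.receive_data(pd.DataFrame({'timestamp': [1, 2, 3], 'value': [0.1, 0.2, 0.3]}))
bucket.drop_data(2)            # drop the two oldest rows
assert len(bucket.data) == 1   # only the newest row remains
```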

47
analytics/analytic_types/detector.py

@@ -0,0 +1,47 @@
from analytic_types import ModelCache, TimeSeries
from analytic_types.segment import Segment
from enum import Enum
from typing import List, Optional, Tuple
import utils.meta
class Bound(Enum):
ALL = 'ALL'
UPPER = 'UPPER'
LOWER = 'LOWER'
class DetectionResult:
def __init__(
self,
cache: Optional[ModelCache] = None,
segments: Optional[List[Segment]] = None,
last_detection_time: Optional[int] = None
):
if cache is None:
cache = {}
if segments is None:
segments = []
self.cache = cache
self.segments = segments
self.last_detection_time = last_detection_time
# TODO: use @utils.meta.JSONClass (now it can't serialize list of objects)
def to_json(self):
return {
'cache': self.cache,
'segments': list(map(lambda segment: segment.to_json(), self.segments)),
'lastDetectionTime': self.last_detection_time
}
@utils.meta.JSONClass
class ProcessingResult():
def __init__(
self,
lower_bound: Optional[TimeSeries] = None,
upper_bound: Optional[TimeSeries] = None,
):
self.lower_bound = lower_bound
self.upper_bound = upper_bound

17
analytics/analytic_types/learning_info.py

@@ -0,0 +1,17 @@
import utils.meta
@utils.meta.JSONClass
class LearningInfo:
def __init__(self):
super().__init__()
self.confidence = []
self.patterns_list = []
self.pattern_width = []
self.pattern_height = []
self.pattern_timestamp = []
self.segment_center_list = []
self.patterns_value = []
def __str__(self):
return str(self.to_json())

57
analytics/analytic_types/segment.py

@@ -0,0 +1,57 @@
from typing import Optional
import utils.meta
@utils.meta.JSONClass
class Segment:
'''
Used for segment manipulation instead of { 'from': ..., 'to': ... } dict
'''
def __init__(
self,
from_timestamp: int,
to_timestamp: int,
_id: Optional[str] = None,
analytic_unit_id: Optional[str] = None,
labeled: Optional[bool] = None,
deleted: Optional[bool] = None,
message: Optional[str] = None
):
if to_timestamp < from_timestamp:
raise ValueError(f'Can`t create segment with to < from: {to_timestamp} < {from_timestamp}')
self.from_timestamp = from_timestamp
self.to_timestamp = to_timestamp
self._id = _id
self.analytic_unit_id = analytic_unit_id
self.labeled = labeled
self.deleted = deleted
self.message = message
@utils.meta.JSONClass
class AnomalyDetectorSegment(Segment):
'''
Used for segment manipulation instead of { 'from': ..., 'to': ..., 'data': ... } dict
'''
def __init__(
self,
from_timestamp: int,
to_timestamp: int,
data = [],
_id: Optional[str] = None,
analytic_unit_id: Optional[str] = None,
labeled: Optional[bool] = None,
deleted: Optional[bool] = None,
message: Optional[str] = None
):
super().__init__(
from_timestamp,
to_timestamp,
_id,
analytic_unit_id,
labeled,
deleted,
message
)
self.data = data

103
analytics/analytic_unit_manager.py

@@ -0,0 +1,103 @@
from typing import Dict
import logging as log
import traceback
from concurrent.futures import Executor, ThreadPoolExecutor
from analytic_unit_worker import AnalyticUnitWorker
from analytic_types import AnalyticUnitId, ModelCache
from analytic_types.segment import Segment
import detectors
logger = log.getLogger('AnalyticUnitManager')
def get_detector_by_type(
detector_type: str, analytic_unit_type: str, analytic_unit_id: AnalyticUnitId
) -> detectors.Detector:
if detector_type == 'pattern':
return detectors.PatternDetector(analytic_unit_type, analytic_unit_id)
elif detector_type == 'threshold':
return detectors.ThresholdDetector(analytic_unit_id)
elif detector_type == 'anomaly':
return detectors.AnomalyDetector(analytic_unit_id)
raise ValueError('Unknown detector type "%s"' % detector_type)
class AnalyticUnitManager:
def __init__(self):
self.analytic_workers: Dict[AnalyticUnitId, AnalyticUnitWorker] = dict()
self.workers_executor = ThreadPoolExecutor()
def __ensure_worker(
self,
analytic_unit_id: AnalyticUnitId,
detector_type: str,
analytic_unit_type: str
) -> AnalyticUnitWorker:
if analytic_unit_id in self.analytic_workers:
# TODO: check that type is the same
return self.analytic_workers[analytic_unit_id]
detector = get_detector_by_type(detector_type, analytic_unit_type, analytic_unit_id)
worker = AnalyticUnitWorker(analytic_unit_id, detector, self.workers_executor)
self.analytic_workers[analytic_unit_id] = worker
return worker
async def __handle_analytic_task(self, task: object) -> dict:
"""
returns payload or None
"""
analytic_unit_id: AnalyticUnitId = task['analyticUnitId']
log.debug('Analytics got task of type: {} for unit: {}'.format(task['type'], analytic_unit_id))
if task['type'] == 'CANCEL':
if analytic_unit_id in self.analytic_workers:
self.analytic_workers[analytic_unit_id].cancel()
return
payload = task['payload']
worker = self.__ensure_worker(analytic_unit_id, payload['detector'], payload['analyticUnitType'])
data = payload.get('data')
if task['type'] == 'PUSH':
# TODO: do it a better way
res = await worker.consume_data(data, payload['cache'])
if res:
res.update({ 'analyticUnitId': analytic_unit_id })
return res
elif task['type'] == 'LEARN':
if 'segments' in payload:
segments = payload['segments']
segments = [Segment.from_json(segment) for segment in segments]
return await worker.do_train(segments, data, payload['cache'])
elif 'threshold' in payload:
return await worker.do_train(payload['threshold'], data, payload['cache'])
elif 'anomaly' in payload:
return await worker.do_train(payload['anomaly'], data, payload['cache'])
else:
raise ValueError('No segments or threshold in LEARN payload')
elif task['type'] == 'DETECT':
return await worker.do_detect(data, payload['cache'])
elif task['type'] == 'PROCESS':
return await worker.process_data(data, payload['cache'])
raise ValueError('Unknown task type "%s"' % task['type'])
async def handle_analytic_task(self, task: object):
try:
log.debug('Start handle_analytic_task with analytic unit: {}'.format(task['analyticUnitId']))
result_payload = await self.__handle_analytic_task(task)
result_message = {
'status': 'SUCCESS',
'payload': result_payload
}
log.debug('handle_analytic_task finished correctly for analytic unit: {}'.format(task['analyticUnitId']))
return result_message
except Exception as e:
error_text = traceback.format_exc()
logger.error("handle_analytic_task Exception: '%s'" % error_text)
# TODO: move result to a class which renders to json for messaging to analytics
return {
'status': 'FAILED',
'error': repr(e)
}

116
analytics/analytic_unit_worker.py

@@ -0,0 +1,116 @@
import config
import detectors
import logging
import pandas as pd
from typing import Optional, Union, Generator, List, Tuple
import concurrent.futures
import asyncio
import utils
from utils import get_intersected_chunks, get_chunks, prepare_data
from analytic_types import ModelCache, TimeSeries
from analytic_types.detector import DetectionResult
logger = logging.getLogger('AnalyticUnitWorker')
class AnalyticUnitWorker:
CHUNK_WINDOW_SIZE_FACTOR = 100
CHUNK_INTERSECTION_FACTOR = 2
assert CHUNK_WINDOW_SIZE_FACTOR > CHUNK_INTERSECTION_FACTOR, \
'CHUNK_INTERSECTION_FACTOR should be less than CHUNK_WINDOW_SIZE_FACTOR'
def __init__(self, analytic_unit_id: str, detector: detectors.Detector, executor: concurrent.futures.Executor):
self.analytic_unit_id = analytic_unit_id
self._detector = detector
self._executor: concurrent.futures.Executor = executor
self._training_future: asyncio.Future = None
async def do_train(
self, payload: Union[list, dict], data: TimeSeries, cache: Optional[ModelCache]
) -> Optional[ModelCache]:
dataframe = prepare_data(data)
cfuture: concurrent.futures.Future = self._executor.submit(
self._detector.train, dataframe, payload, cache
)
self._training_future = asyncio.wrap_future(cfuture)
try:
new_cache: ModelCache = await asyncio.wait_for(self._training_future, timeout = config.LEARNING_TIMEOUT)
return new_cache
except asyncio.CancelledError:
return None
except asyncio.TimeoutError:
raise Exception('Timeout ({}s) exceeded while learning'.format(config.LEARNING_TIMEOUT))
async def do_detect(self, data: pd.DataFrame, cache: Optional[ModelCache]) -> DetectionResult:
window_size = self._detector.get_window_size(cache)
chunk_size = window_size * self.CHUNK_WINDOW_SIZE_FACTOR
chunk_intersection = window_size * self.CHUNK_INTERSECTION_FACTOR
detections: List[DetectionResult] = []
chunks = []
# XXX: get_chunks(data, chunk_size) == get_intersected_chunks(data, 0, chunk_size)
if self._detector.is_detection_intersected():
chunks = get_intersected_chunks(data, chunk_intersection, chunk_size)
else:
chunks = get_chunks(data, chunk_size)
for chunk in chunks:
await asyncio.sleep(0)
chunk_dataframe = prepare_data(chunk)
detected: DetectionResult = self._detector.detect(chunk_dataframe, cache)
detections.append(detected)
if len(detections) == 0:
raise RuntimeError(f'do_detect for {self.analytic_unit_id} got empty detection results')
detection_result = self._detector.concat_detection_results(detections)
return detection_result.to_json()
def cancel(self):
if self._training_future is not None:
self._training_future.cancel()
async def consume_data(self, data: TimeSeries, cache: Optional[ModelCache]) -> Optional[dict]:
window_size = self._detector.get_window_size(cache)
detections: List[DetectionResult] = []
for chunk in get_chunks(data, window_size * self.CHUNK_WINDOW_SIZE_FACTOR):
await asyncio.sleep(0)
chunk_dataframe = prepare_data(chunk)
detected = self._detector.consume_data(chunk_dataframe, cache)
if detected is not None:
detections.append(detected)
if len(detections) == 0:
return None
else:
detection_result = self._detector.concat_detection_results(detections)
return detection_result.to_json()
async def process_data(self, data: TimeSeries, cache: ModelCache) -> dict:
assert isinstance(self._detector, detectors.ProcessingDetector), \
f'{self.analytic_unit_id} detector is not ProcessingDetector, can`t process data'
assert cache is not None, f'{self.analytic_unit_id} got empty cache for processing data'
processed_chunks = []
window_size = self._detector.get_window_size(cache)
for chunk in get_chunks(data, window_size * self.CHUNK_WINDOW_SIZE_FACTOR):
await asyncio.sleep(0)
chunk_dataframe = prepare_data(chunk)
processed = self._detector.process_data(chunk_dataframe, cache)
if processed is not None:
processed_chunks.append(processed)
if len(processed_chunks) == 0:
raise RuntimeError(f'process_data for {self.analytic_unit_id} got empty processing results')
# TODO: maybe we should process all chunks inside of detector?
result = self._detector.concat_processing_results(processed_chunks)
return result.to_json()

30
analytics/config.py

@@ -0,0 +1,30 @@
import os
import json
PARENT_FOLDER = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
CONFIG_FILE = os.path.join(PARENT_FOLDER, 'config.json')
config_exists = os.path.isfile(CONFIG_FILE)
if config_exists:
with open(CONFIG_FILE) as f:
config = json.load(f)
else:
print('Config file %s doesn`t exist, using defaults' % CONFIG_FILE)
def get_config_field(field: str, default_val = None):
if field in os.environ:
return os.environ[field]
if config_exists and field in config and config[field] != '':
return config[field]
if default_val is not None:
return default_val
raise Exception('Please configure {}'.format(field))
HASTIC_SERVER_URL = get_config_field('HASTIC_SERVER_URL', 'ws://localhost:8002')
LEARNING_TIMEOUT = get_config_field('LEARNING_TIMEOUT', 120)
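The lookup order of `get_config_field` is: environment variable, then `config.json`, then the default, otherwise an exception. A small sketch (the values are assumptions; run with `PYTHONPATH=analytics` as in `.vscode/.env`):
```
import os
os.environ['LEARNING_TIMEOUT'] = '60'   # the environment wins over config.json and defaults

import config                           # reads config.json (if any) at import time

print(config.get_config_field('LEARNING_TIMEOUT', 120))                      # '60' (a string, from the environment)
print(config.get_config_field('HASTIC_SERVER_URL', 'ws://localhost:8002'))   # the default unless configured
# config.get_config_field('MISSING_FIELD') with no default raises an Exception
```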

4
analytics/detectors/__init__.py

@@ -0,0 +1,4 @@
from detectors.detector import Detector, ProcessingDetector
from detectors.pattern_detector import PatternDetector
from detectors.threshold_detector import ThresholdDetector
from detectors.anomaly_detector import AnomalyDetector

277
analytics/detectors/anomaly_detector.py

@@ -0,0 +1,277 @@
from enum import Enum
import logging
import numpy as np
import pandas as pd
import math
from typing import Optional, Union, List, Tuple, Generator
import operator
from analytic_types import AnalyticUnitId, ModelCache
from analytic_types.detector import DetectionResult, ProcessingResult, Bound
from analytic_types.data_bucket import DataBucket
from analytic_types.segment import Segment, AnomalyDetectorSegment
from analytic_types.cache import AnomalyCache
from detectors import Detector, ProcessingDetector
import utils
MAX_DEPENDENCY_LEVEL = 100
MIN_DEPENDENCY_FACTOR = 0.1
BASIC_ALPHA = 0.5
logger = logging.getLogger('ANOMALY_DETECTOR')
class AnomalyDetector(ProcessingDetector):
def __init__(self, analytic_unit_id: AnalyticUnitId):
super().__init__(analytic_unit_id)
self.bucket = DataBucket()
def train(self, dataframe: pd.DataFrame, payload: Union[list, dict], cache: Optional[ModelCache]) -> ModelCache:
cache = AnomalyCache.from_json(payload)
cache.time_step = utils.find_interval(dataframe)
segments = cache.segments
if len(segments) > 0:
seasonality = cache.seasonality
prepared_segments = []
for segment in segments:
segment_len = (int(segment.to_timestamp) - int(segment.from_timestamp))
assert segment_len <= seasonality, \
f'seasonality {seasonality} must be greater than or equal to segment length {segment_len}'
from_index = utils.timestamp_to_index(dataframe, pd.to_datetime(segment.from_timestamp, unit='ms'))
to_index = utils.timestamp_to_index(dataframe, pd.to_datetime(segment.to_timestamp, unit='ms'))
segment_data = dataframe[from_index : to_index]
prepared_segments.append(
AnomalyDetectorSegment(
segment.from_timestamp,
segment.to_timestamp,
segment_data.value.tolist()
)
)
cache.set_segments(prepared_segments)
return {
'cache': cache.to_json()
}
# TODO: ModelCache -> DetectorState
def detect(self, dataframe: pd.DataFrame, cache: Optional[ModelCache]) -> DetectionResult:
if cache is None:
raise ValueError(f'Analytic unit {self.analytic_unit_id} got empty cache')
data = dataframe['value']
cache = AnomalyCache.from_json(cache)
segments = cache.segments
enabled_bounds = cache.get_enabled_bounds()
smoothed_data = utils.exponential_smoothing(data, cache.alpha)
lower_bound = smoothed_data - cache.confidence
upper_bound = smoothed_data + cache.confidence
if len(segments) > 0:
data_start_time = utils.convert_pd_timestamp_to_ms(dataframe['timestamp'][0])
for segment in segments:
seasonality_index = cache.seasonality // cache.time_step
seasonality_offset = self.get_seasonality_offset(
segment.from_timestamp,
cache.seasonality,
data_start_time,
cache.time_step
)
segment_data = pd.Series(segment.data)
lower_bound = self.add_season_to_data(lower_bound, segment_data, seasonality_offset, seasonality_index, Bound.LOWER)
upper_bound = self.add_season_to_data(upper_bound, segment_data, seasonality_offset, seasonality_index, Bound.UPPER)
detected_segments = list(self.detections_generator(dataframe, upper_bound, lower_bound, enabled_bounds))
last_dataframe_time = dataframe.iloc[-1]['timestamp']
last_detection_time = utils.convert_pd_timestamp_to_ms(last_dataframe_time)
return DetectionResult(cache.to_json(), detected_segments, last_detection_time)
def consume_data(self, data: pd.DataFrame, cache: Optional[ModelCache]) -> Optional[DetectionResult]:
if cache is None:
msg = f'consume_data got invalid cache {cache} for task {self.analytic_unit_id}'
logging.debug(msg)
raise ValueError(msg)
data_without_nan = data.dropna()
if len(data_without_nan) == 0:
return None
self.bucket.receive_data(data_without_nan)
if len(self.bucket.data) >= self.get_window_size(cache):
return self.detect(self.bucket.data, cache)
return None
def is_detection_intersected(self) -> bool:
return False
def get_window_size(self, cache: Optional[ModelCache]) -> int:
'''
get the number of values that will affect the next value
'''
if cache is None:
raise ValueError('anomaly detector got None cache')
cache = AnomalyCache.from_json(cache)
for level in range(1, MAX_DEPENDENCY_LEVEL):
if (1 - cache.alpha) ** level < MIN_DEPENDENCY_FACTOR:
break
seasonality = 0
if len(cache.segments) > 0:
seasonality = cache.seasonality // cache.time_step
return max(level, seasonality)
def concat_detection_results(self, detections: List[DetectionResult]) -> DetectionResult:
result = DetectionResult()
time_step = detections[0].cache['timeStep']
for detection in detections:
result.segments.extend(detection.segments)
result.last_detection_time = detection.last_detection_time
result.cache = detection.cache
result.segments = utils.merge_intersecting_segments(result.segments, time_step)
return result
# TODO: remove duplication with detect()
def process_data(self, dataframe: pd.DataFrame, cache: ModelCache) -> ProcessingResult:
cache = AnomalyCache.from_json(cache)
segments = cache.segments
enabled_bounds = cache.get_enabled_bounds()
# TODO: exponential_smoothing should return dataframe with related timestamps
smoothed_data = utils.exponential_smoothing(dataframe['value'], cache.alpha)
lower_bound = smoothed_data - cache.confidence
upper_bound = smoothed_data + cache.confidence
if len(segments) > 0:
data_start_time = utils.convert_pd_timestamp_to_ms(dataframe['timestamp'][0])
for segment in segments:
seasonality_index = cache.seasonality // cache.time_step
# TODO: move it to utils and add tests
seasonality_offset = self.get_seasonality_offset(
segment.from_timestamp,
cache.seasonality,
data_start_time,
cache.time_step
)
segment_data = pd.Series(segment.data)
lower_bound = self.add_season_to_data(lower_bound, segment_data, seasonality_offset, seasonality_index, Bound.LOWER)
upper_bound = self.add_season_to_data(upper_bound, segment_data, seasonality_offset, seasonality_index, Bound.UPPER)
# TODO: support multiple segments
timestamps = utils.convert_series_to_timestamp_list(dataframe.timestamp)
lower_bound_timeseries = list(zip(timestamps, lower_bound.values.tolist()))
upper_bound_timeseries = list(zip(timestamps, upper_bound.values.tolist()))
if enabled_bounds == Bound.ALL:
return ProcessingResult(lower_bound_timeseries, upper_bound_timeseries)
elif enabled_bounds == Bound.UPPER:
return ProcessingResult(upper_bound = upper_bound_timeseries)
elif enabled_bounds == Bound.LOWER:
return ProcessingResult(lower_bound = lower_bound_timeseries)
def add_season_to_data(self, data: pd.Series, segment: pd.Series, offset: int, seasonality: int, bound_type: Bound) -> pd.Series:
# data - smoothed data to which the seasonal segment is applied
# for Bound.UPPER the segment's upper bound is added to the data
# for Bound.LOWER the segment's lower bound is subtracted from the data
len_smoothed_data = len(data)
for idx, _ in enumerate(data):
if idx - offset < 0:
#TODO: add seasonality for non empty parts
continue
if (idx - offset) % seasonality == 0:
if bound_type == Bound.UPPER:
upper_segment_bound = self.get_segment_bound(segment, Bound.UPPER)
data = data.add(pd.Series(upper_segment_bound.values, index = segment.index + idx), fill_value = 0)
elif bound_type == Bound.LOWER:
lower_segment_bound = self.get_segment_bound(segment, Bound.LOWER)
data = data.add(pd.Series(lower_segment_bound.values * -1, index = segment.index + idx), fill_value = 0)
else:
raise ValueError(f'unknown bound type: {bound_type.value}')
return data[:len_smoothed_data]
def get_segment_bound(self, segment: pd.Series, bound: Bound) -> pd.Series:
'''
the segment is split at its median to get its top or bottom part;
that part is smoothed and shifted above or below the segment
'''
if len(segment) < 2:
return segment
comparison_operator = operator.gt if bound == Bound.UPPER else operator.le
segment = segment - segment.min()
segment_median = segment.median()
part = [val if comparison_operator(val, segment_median) else segment_median for val in segment.values]
part = pd.Series(part, index = segment.index)
smoothed_part = utils.exponential_smoothing(part, BASIC_ALPHA)
difference = [abs(x - y) for x, y in zip(part, smoothed_part)]
max_diff = max(difference)
bound = [val + max_diff for val in smoothed_part.values]
bound = pd.Series(bound, index = segment.index)
return bound
def get_seasonality_offset(self, from_timestamp: int, seasonality: int, data_start_time: int, time_step: int) -> int:
season_count = math.ceil(abs(from_timestamp - data_start_time) / seasonality)
start_seasonal_segment = from_timestamp + seasonality * season_count
seasonality_time_offset = abs(start_seasonal_segment - data_start_time) % seasonality
seasonality_offset = math.ceil(seasonality_time_offset / time_step)
return seasonality_offset
def detections_generator(
self,
dataframe: pd.DataFrame,
upper_bound: pd.DataFrame,
lower_bound: pd.DataFrame,
enabled_bounds: Bound
) -> Generator[Segment, None, Segment]:
in_segment = False
segment_start = 0
bound: Bound = None
for idx, val in enumerate(dataframe['value'].values):
if val > upper_bound.values[idx]:
if enabled_bounds == Bound.UPPER or enabled_bounds == Bound.ALL:
if not in_segment:
in_segment = True
segment_start = dataframe['timestamp'][idx]
bound = Bound.UPPER
continue
if val < lower_bound.values[idx]:
if enabled_bounds == Bound.LOWER or enabled_bounds == Bound.ALL:
if not in_segment:
in_segment = True
segment_start = dataframe['timestamp'][idx]
bound = Bound.LOWER
continue
if in_segment:
segment_end = dataframe['timestamp'][idx - 1]
yield Segment(
utils.convert_pd_timestamp_to_ms(segment_start),
utils.convert_pd_timestamp_to_ms(segment_end),
message=f'{val} out of {str(bound.value)} bound'
)
in_segment = False
else:
if in_segment:
segment_end = dataframe['timestamp'][idx]
return Segment(
utils.convert_pd_timestamp_to_ms(segment_start),
utils.convert_pd_timestamp_to_ms(segment_end),
message=f'{val} out of {str(bound.value)} bound'
)
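A quick numeric check of `get_seasonality_offset` above, with assumed millisecond inputs:
```
from detectors import AnomalyDetector

detector = AnomalyDetector('test-unit-id')   # the unit id is an arbitrary example
# seasonality = 30 000 ms, time_step = 5 000 ms,
# segment starts at 10 000 ms, data starts at 95 000 ms:
# season_count            = ceil(|10000 - 95000| / 30000) = 3
# start_seasonal_segment  = 10000 + 30000 * 3             = 100000
# seasonality_time_offset = |100000 - 95000| % 30000      = 5000
# seasonality_offset      = ceil(5000 / 5000)             = 1
offset = detector.get_seasonality_offset(10_000, 30_000, 95_000, 5_000)
assert offset == 1
```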

80
analytics/detectors/detector.py

@@ -0,0 +1,80 @@
from abc import ABC, abstractmethod
from pandas import DataFrame
from typing import Optional, Union, List
from analytic_types import ModelCache, TimeSeries, AnalyticUnitId
from analytic_types.detector import DetectionResult, ProcessingResult
from analytic_types.segment import Segment
class Detector(ABC):
def __init__(self, analytic_unit_id: AnalyticUnitId):
self.analytic_unit_id = analytic_unit_id
@abstractmethod
def train(self, dataframe: DataFrame, payload: Union[list, dict], cache: Optional[ModelCache]) -> ModelCache:
"""
Should be thread-safe with respect to other detectors' train methods
"""
pass
@abstractmethod
def detect(self, dataframe: DataFrame, cache: Optional[ModelCache]) -> DetectionResult:
pass
@abstractmethod
def consume_data(self, data: DataFrame, cache: Optional[ModelCache]) -> Optional[DetectionResult]:
pass
@abstractmethod
def get_window_size(self, cache: Optional[ModelCache]) -> int:
pass
def is_detection_intersected(self) -> bool:
return True
def concat_detection_results(self, detections: List[DetectionResult]) -> DetectionResult:
result = DetectionResult()
for detection in detections:
result.segments.extend(detection.segments)
result.last_detection_time = detection.last_detection_time
result.cache = detection.cache
return result
def get_value_from_cache(self, cache: ModelCache, key: str, required = False):
value = cache.get(key)
if value is None and required:
raise ValueError(f'Missing required "{key}" field in cache for analytic unit {self.analytic_unit_id}')
return value
class ProcessingDetector(Detector):
@abstractmethod
def process_data(self, data: TimeSeries, cache: Optional[ModelCache]) -> ProcessingResult:
'''
Process data to produce additional time series that represent the detector's settings (e.g. bounds)
'''
pass
def concat_processing_results(self, processing_results: List[ProcessingResult]) -> Optional[ProcessingResult]:
'''
Concatenate sequential ProcessingResults received after the analytic worker
splits the dataset into chunks
'''
if len(processing_results) == 0:
return None
united_result = ProcessingResult()
for result in processing_results:
if result.lower_bound is not None:
if united_result.lower_bound is None: united_result.lower_bound = []
united_result.lower_bound.extend(result.lower_bound)
if result.upper_bound is not None:
if united_result.upper_bound is None: united_result.upper_bound = []
united_result.upper_bound.extend(result.upper_bound)
return united_result

147
analytics/detectors/pattern_detector.py

@@ -0,0 +1,147 @@
import models
import asyncio
import logging
import config
import pandas as pd
from typing import Optional, Generator, List
from detectors import Detector
from analytic_types.data_bucket import DataBucket
from utils import convert_pd_timestamp_to_ms
from analytic_types import AnalyticUnitId, ModelCache
from analytic_types.detector import DetectionResult
from analytic_types.segment import Segment
import utils
logger = logging.getLogger('PATTERN_DETECTOR')
def resolve_model_by_pattern(pattern: str) -> models.Model:
if pattern == 'GENERAL':
return models.GeneralModel()
if pattern == 'PEAK':
return models.PeakModel()
if pattern == 'TROUGH':
return models.TroughModel()
if pattern == 'DROP':
return models.DropModel()
if pattern == 'JUMP':
return models.JumpModel()
if pattern == 'CUSTOM':
return models.CustomModel()
raise ValueError('Unknown pattern "%s"' % pattern)
class PatternDetector(Detector):
MIN_BUCKET_SIZE = 150
BUCKET_WINDOW_SIZE_FACTOR = 5
DEFAULT_WINDOW_SIZE = 1
def __init__(self, pattern_type: str, analytic_unit_id: AnalyticUnitId):
super().__init__(analytic_unit_id)
self.pattern_type = pattern_type
self.model = resolve_model_by_pattern(self.pattern_type)
self.bucket = DataBucket()
def train(self, dataframe: pd.DataFrame, segments: List[Segment], cache: Optional[ModelCache]) -> ModelCache:
# TODO: pass only part of dataframe that has segments
if not self.contains_labeled_segments(segments):
msg = f'{self.analytic_unit_id} has no positive labeled segments. Pattern detector needs at least 1 positive labeled segment'
logger.error(msg)
raise ValueError(msg)
self.model.state: models.ModelState = self.model.get_state(cache)
new_cache: models.ModelState = self.model.fit(dataframe, segments, self.analytic_unit_id)
# time step is optional
if len(dataframe) > 1:
new_cache.time_step = utils.find_interval(dataframe)
new_cache = new_cache.to_json()
if len(new_cache) == 0:
logging.warning('new_cache is empty with data: {}, segments: {}, cache: {}, analytic unit: {}'.format(dataframe, segments, cache, self.analytic_unit_id))
return {
'cache': new_cache
}
def detect(self, dataframe: pd.DataFrame, cache: Optional[ModelCache]) -> DetectionResult:
logger.debug('Unit {} got {} data points for detection'.format(self.analytic_unit_id, len(dataframe)))
# TODO: split and sleep (https://github.com/hastic/hastic-server/pull/124#discussion_r214085643)
if cache is None:
msg = f'{self.analytic_unit_id} detection got invalid cache, skip detection'
logger.error(msg)
raise ValueError(msg)
self.model.state = self.model.get_state(cache)
window_size = self.model.state.window_size
if window_size is None:
message = '{} got cache without window_size for detection'.format(self.analytic_unit_id)
logger.error(message)
raise ValueError(message)
if len(dataframe) < window_size * 2:
message = f'{self.analytic_unit_id} skip detection: dataset length {len(dataframe)} points less than minimal length {window_size * 2} points'
logger.error(message)
raise ValueError(message)
detected = self.model.detect(dataframe, self.analytic_unit_id)
segments = [Segment(segment[0], segment[1]) for segment in detected['segments']]
new_cache = detected['cache'].to_json()
last_dataframe_time = dataframe.iloc[-1]['timestamp']
last_detection_time = convert_pd_timestamp_to_ms(last_dataframe_time)
return DetectionResult(new_cache, segments, last_detection_time)
def consume_data(self, data: pd.DataFrame, cache: Optional[ModelCache]) -> Optional[DetectionResult]:
logging.debug('Start consume_data for analytic unit {}'.format(self.analytic_unit_id))
if cache is None:
logging.debug(f'consume_data got invalid cache {cache} for task {self.analytic_unit_id}, skip')
return None
data_without_nan = data.dropna()
if len(data_without_nan) == 0:
return None
self.bucket.receive_data(data_without_nan)
# TODO: use ModelState
window_size = cache['windowSize']
bucket_len = len(self.bucket.data)
if bucket_len < window_size * 2:
msg = f'{self.analytic_unit_id} bucket data {bucket_len} less than two window size {window_size * 2}, skip run detection from consume_data'
logger.debug(msg)
return None
res = self.detect(self.bucket.data, cache)
bucket_size = max(window_size * self.BUCKET_WINDOW_SIZE_FACTOR, self.MIN_BUCKET_SIZE)
if bucket_len > bucket_size:
excess_data = bucket_len - bucket_size
self.bucket.drop_data(excess_data)
logging.debug('End consume_data for analytic unit: {} with res: {}'.format(self.analytic_unit_id, str(res.to_json())))
if res:
return res
else:
return None
def get_window_size(self, cache: Optional[ModelCache]) -> int:
if cache is None: return self.DEFAULT_WINDOW_SIZE
# TODO: windowSize -> window_size
return cache.get('windowSize', self.DEFAULT_WINDOW_SIZE)
def contains_labeled_segments(self, segments: List[Segment]) -> bool:
for segment in segments:
if segment.labeled:
return True
return False

111
analytics/detectors/threshold_detector.py

@@ -0,0 +1,111 @@
import logging as log
import operator
import pandas as pd
import numpy as np
from typing import Optional, List
from analytic_types import ModelCache, AnalyticUnitId
from analytic_types.detector import DetectionResult, ProcessingResult
from analytic_types.segment import Segment
from detectors import ProcessingDetector
from time import time
import utils
logger = log.getLogger('THRESHOLD_DETECTOR')
class ThresholdDetector(ProcessingDetector):
WINDOW_SIZE = 3
def __init__(self, analytic_unit_id: AnalyticUnitId):
super().__init__(analytic_unit_id)
def train(self, dataframe: pd.DataFrame, threshold: dict, cache: Optional[ModelCache]) -> ModelCache:
time_step = utils.find_interval(dataframe)
return {
'cache': {
'value': threshold['value'],
'condition': threshold['condition'],
'timeStep': time_step
}
}
def detect(self, dataframe: pd.DataFrame, cache: ModelCache) -> DetectionResult:
if cache is None or cache == {}:
raise ValueError('Threshold detector error: cannot detect before learning')
if len(dataframe) == 0:
return None
value = cache['value']
condition = cache['condition']
segments = []
for index, row in dataframe.iterrows():
current_value = row['value']
current_timestamp = utils.convert_pd_timestamp_to_ms(row['timestamp'])
segment = Segment(current_timestamp, current_timestamp)
# TODO: merge segments
if pd.isnull(current_value):
if condition == 'NO_DATA':
segment.message = 'NO_DATA detected'
segments.append(segment)
continue
comparators = {
'>': operator.gt,
'<': operator.lt,
'=': operator.eq,
'>=': operator.ge,
'<=': operator.le
}
assert condition in comparators.keys(), f'condition {condition} not allowed'
if comparators[condition](current_value, value):
segment.message = f"{current_value} {condition} threshold's value {value}"
segments.append(segment)
last_entry = dataframe.iloc[-1]
last_detection_time = utils.convert_pd_timestamp_to_ms(last_entry['timestamp'])
return DetectionResult(cache, segments, last_detection_time)
def consume_data(self, data: pd.DataFrame, cache: Optional[ModelCache]) -> Optional[DetectionResult]:
result = self.detect(data, cache)
return result if result else None
def get_window_size(self, cache: Optional[ModelCache]) -> int:
return self.WINDOW_SIZE
def concat_detection_results(self, detections: List[DetectionResult]) -> DetectionResult:
result = DetectionResult()
time_step = detections[0].cache['timeStep']
for detection in detections:
result.segments.extend(detection.segments)
result.last_detection_time = detection.last_detection_time
result.cache = detection.cache
result.segments = utils.merge_intersecting_segments(result.segments, time_step)
return result
def process_data(self, dataframe: pd.DataFrame, cache: ModelCache) -> ProcessingResult:
data = dataframe['value']
value = self.get_value_from_cache(cache, 'value', required = True)
condition = self.get_value_from_cache(cache, 'condition', required = True)
if condition == 'NO_DATA':
return ProcessingResult()
data.values[:] = value
timestamps = utils.convert_series_to_timestamp_list(dataframe.timestamp)
result_series = list(zip(timestamps, data.values.tolist()))
if condition in ['>', '>=', '=']:
return ProcessingResult(upper_bound = result_series)
if condition in ['<', '<=']:
return ProcessingResult(lower_bound = result_series)
raise ValueError(f'{condition} condition not supported')
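A small illustration of the pieces above: the `condition` string is mapped onto an `operator` function during detection, and `train` packs the threshold into the cache that `detect`/`process_data` consume later (the sample values are assumptions):
```
import operator

# the mapping used inside detect()
comparators = {'>': operator.gt, '<': operator.lt, '=': operator.eq, '>=': operator.ge, '<=': operator.le}
assert comparators['>='](10.5, 10)   # 10.5 >= 10 -> a segment is emitted for this point

# shape of the state produced by train() and consumed by detect()/process_data():
# {'cache': {'value': 10, 'condition': '>=', 'timeStep': 60000}}
# '>' / '>=' / '=' are rendered as an upper bound, '<' / '<=' as a lower bound
```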

9
analytics/models/__init__.py

@@ -0,0 +1,9 @@
from models.model import Model, ModelState, AnalyticSegment, ModelType, ExtremumType
from models.triangle_model import TriangleModel, TriangleModelState
from models.stair_model import StairModel, StairModelState
from models.drop_model import DropModel
from models.peak_model import PeakModel
from models.jump_model import JumpModel
from models.custom_model import CustomModel
from models.trough_model import TroughModel
from models.general_model import GeneralModel, GeneralModelState

30
analytics/models/custom_model.py

@@ -0,0 +1,30 @@
from models import Model, AnalyticSegment, ModelState, ModelType
from analytic_types import AnalyticUnitId, ModelCache
from analytic_types.learning_info import LearningInfo
import utils
import pandas as pd
from typing import List, Optional
class CustomModel(Model):
def do_fit(
self,
dataframe: pd.DataFrame,
labeled_segments: List[AnalyticSegment],
deleted_segments: List[AnalyticSegment],
learning_info: LearningInfo
) -> None:
pass
def do_detect(self, dataframe: pd.DataFrame) -> list:
return []
def find_segment_center(self, dataframe: pd.DataFrame, start: int, end: int) -> int:
pass
def get_model_type(self) -> ModelType:
pass
def get_state(self, cache: Optional[ModelCache] = None) -> ModelState:
pass

9
analytics/models/drop_model.py

@@ -0,0 +1,9 @@
from models import StairModel, ModelType, ExtremumType
class DropModel(StairModel):
def get_model_type(self) -> ModelType:
return ModelType.DROP
def get_extremum_type(self) -> ExtremumType:
return ExtremumType.MIN

104
analytics/models/general_model.py

@@ -0,0 +1,104 @@
from analytic_types import AnalyticUnitId
from models import Model, ModelState, AnalyticSegment, ModelType
from typing import Union, List, Generator
import utils
import utils.meta
import numpy as np
import pandas as pd
import scipy.signal
from scipy.fftpack import fft
from scipy.signal import argrelextrema
from scipy.stats.stats import pearsonr
from scipy.stats import gaussian_kde
from scipy.stats import norm
import logging
from typing import Optional, List, Tuple
import math
from analytic_types import AnalyticUnitId, TimeSeries
from analytic_types.learning_info import LearningInfo
PEARSON_FACTOR = 0.7
@utils.meta.JSONClass
class GeneralModelState(ModelState):
def __init__(self, **kwargs):
super().__init__(**kwargs)
class GeneralModel(Model):
def get_model_type(self) -> ModelType:
return ModelType.GENERAL
def find_segment_center(self, dataframe: pd.DataFrame, start: int, end: int) -> int:
data = dataframe['value']
segment = data[start: end]
center_ind = start + math.ceil((end - start) / 2)
return center_ind
def get_state(self, cache: Optional[dict] = None) -> GeneralModelState:
return GeneralModelState.from_json(cache)
def do_fit(
self,
dataframe: pd.DataFrame,
labeled_segments: List[AnalyticSegment],
deleted_segments: List[AnalyticSegment],
learning_info: LearningInfo
) -> None:
data = utils.cut_dataframe(dataframe)
data = data['value']
last_pattern_center = self.state.pattern_center
self.state.pattern_center = utils.remove_duplicates_and_sort(last_pattern_center + learning_info.segment_center_list)
self.state.pattern_model = utils.get_av_model(learning_info.patterns_list)
convolve_list = utils.get_convolve(self.state.pattern_center, self.state.pattern_model, data, self.state.window_size)
correlation_list = utils.get_correlation(self.state.pattern_center, self.state.pattern_model, data, self.state.window_size)
del_conv_list = []
delete_pattern_timestamp = []
for segment in deleted_segments:
del_mid_index = segment.center_index
delete_pattern_timestamp.append(segment.pattern_timestamp)
deleted_pat = utils.get_interval(data, del_mid_index, self.state.window_size)
deleted_pat = utils.subtract_min_without_nan(deleted_pat)
del_conv_pat = scipy.signal.fftconvolve(deleted_pat, self.state.pattern_model)
if len(del_conv_pat): del_conv_list.append(max(del_conv_pat))
self.state.convolve_min, self.state.convolve_max = utils.get_min_max(convolve_list, self.state.window_size / 3)
self.state.conv_del_min, self.state.conv_del_max = utils.get_min_max(del_conv_list, self.state.window_size)
def do_detect(self, dataframe: pd.DataFrame) -> TimeSeries:
data = utils.cut_dataframe(dataframe)
data = data['value']
pat_data = self.state.pattern_model
if pat_data.count(0) == len(pat_data):
raise ValueError('Labeled patterns must not be empty')
window_size = self.state.window_size
all_corr = utils.get_correlation_gen(data, window_size, pat_data)
all_corr_peaks = utils.find_peaks(all_corr, window_size * 2)
filtered = self.__filter_detection(all_corr_peaks, data)
filtered = list(filtered)
return [(item, item + window_size * 2) for item in filtered]
def __filter_detection(self, segments: Generator[int, None, None], data: pd.Series) -> Generator[int, None, None]:
if not self.state.pattern_center:
return []
window_size = self.state.window_size
pattern_model = self.state.pattern_model
for ind, val in segments:
watch_data = data[ind - window_size: ind + window_size + 1]
watch_data = utils.subtract_min_without_nan(watch_data)
convolve_segment = scipy.signal.fftconvolve(watch_data, pattern_model)
if len(convolve_segment) > 0:
watch_conv = max(convolve_segment)
else:
continue
if watch_conv < self.state.convolve_min * 0.8 or val < PEARSON_FACTOR:
continue
if watch_conv < self.state.conv_del_max * 1.02 and watch_conv > self.state.conv_del_min * 0.98:
continue
yield ind

9
analytics/models/jump_model.py

@@ -0,0 +1,9 @@
from models import StairModel, ModelType, ExtremumType
class JumpModel(StairModel):
def get_model_type(self) -> ModelType:
return ModelType.JUMP
def get_extremum_type(self) -> ExtremumType:
return ExtremumType.MAX

230
analytics/models/model.py

@@ -0,0 +1,230 @@
from analytic_types import AnalyticUnitId, ModelCache, TimeSeries
from analytic_types.segment import Segment
from analytic_types.learning_info import LearningInfo
import utils
import utils.meta
from abc import ABC, abstractmethod
from attrdict import AttrDict
from typing import Optional, List, Tuple
import pandas as pd
import math
import logging
from enum import Enum
class ModelType(Enum):
JUMP = 'jump'
DROP = 'drop'
PEAK = 'peak'
TROUGH = 'trough'
GENERAL = 'general'
class ExtremumType(Enum):
MAX = 'max'
MIN = 'min'
class AnalyticSegment(Segment):
'''
Segment with specific analytics fields used by models:
- `labeled` / `deleted` flags
- `from` / `to` / `center` indices
- `length`
- `data`
- etc
'''
def __init__(
self,
from_timestamp: int,
to_timestamp: int,
_id: str,
analytic_unit_id: str,
labeled: bool,
deleted: bool,
message: str,
dataframe: pd.DataFrame,
center_finder = None
):
super().__init__(
from_timestamp,
to_timestamp,
_id,
analytic_unit_id,
labeled,
deleted,
message
)
self.from_index = utils.timestamp_to_index(dataframe, pd.to_datetime(self.from_timestamp, unit='ms'))
self.to_index = utils.timestamp_to_index(dataframe, pd.to_datetime(self.to_timestamp, unit='ms'))
self.length = abs(self.to_index - self.from_index)
self.__percent_of_nans = 0
if callable(center_finder):
self.center_index = center_finder(dataframe, self.from_index, self.to_index)
self.pattern_timestamp = dataframe['timestamp'][self.center_index]
else:
self.center_index = self.from_index + math.ceil(self.length / 2)
self.pattern_timestamp = dataframe['timestamp'][self.center_index]
assert len(dataframe['value']) >= self.to_index + 1, \
'segment {}-{} out of dataframe length={}'.format(self.from_index, self.to_index + 1, len(dataframe['value']))
self.data = dataframe['value'][self.from_index: self.to_index + 1]
@property
def percent_of_nans(self):
if not self.__percent_of_nans:
self.__percent_of_nans = self.data.isnull().sum() / len(self.data)
return self.__percent_of_nans
def convert_nan_to_zero(self):
nan_list = utils.find_nan_indexes(self.data)
self.data = utils.nan_to_zero(self.data, nan_list)
@utils.meta.JSONClass
class ModelState():
def __init__(
self,
time_step: int = 0,
pattern_center: List[int] = None,
pattern_model: List[float] = None,
convolve_max: float = 0,
convolve_min: float = 0,
window_size: int = 0,
conv_del_min: float = 0,
conv_del_max: float = 0
):
self.time_step = time_step
self.pattern_center = pattern_center if pattern_center is not None else []
self.pattern_model = pattern_model if pattern_model is not None else []
self.convolve_max = convolve_max
self.convolve_min = convolve_min
self.window_size = window_size
self.conv_del_min = conv_del_min
self.conv_del_max = conv_del_max
class Model(ABC):
HEIGHT_ERROR = 0.1
CONV_ERROR = 0.2
DEL_CONV_ERROR = 0.02
@abstractmethod
def do_fit(
self,
dataframe: pd.DataFrame,
labeled_segments: List[AnalyticSegment],
deleted_segments: List[AnalyticSegment],
learning_info: LearningInfo
) -> None:
pass
@abstractmethod
def do_detect(self, dataframe: pd.DataFrame) -> TimeSeries:
pass
@abstractmethod
def find_segment_center(self, dataframe: pd.DataFrame, start: int, end: int) -> int:
pass
@abstractmethod
def get_model_type(self) -> ModelType:
pass
@abstractmethod
def get_state(self, cache: Optional[ModelCache] = None) -> ModelState:
pass
def fit(self, dataframe: pd.DataFrame, segments: List[Segment], id: AnalyticUnitId) -> ModelState:
logging.debug('Start method fit for analytic unit {}'.format(id))
data = dataframe['value']
max_length = 0
labeled = []
deleted = []
for segment_map in segments:
if segment_map.labeled or segment_map.deleted:
segment = AnalyticSegment(
segment_map.from_timestamp,
segment_map.to_timestamp,
segment_map._id,
segment_map.analytic_unit_id,
segment_map.labeled,
segment_map.deleted,
segment_map.message,
dataframe,
self.find_segment_center
)
if segment.percent_of_nans > 0.1 or len(segment.data) == 0:
logging.debug(f'segment {segment.from_index}-{segment.to_index} skip because of invalid data')
continue
if segment.percent_of_nans > 0:
segment.convert_nan_to_zero()
max_length = max(segment.length, max_length)
if segment.labeled: labeled.append(segment)
if segment.deleted: deleted.append(segment)
assert len(labeled) > 0, f'labeled list empty, skip fitting for {id}'
if self.state.window_size == 0:
self.state.window_size = math.ceil(max_length / 2) if max_length else 0
learning_info = self.get_parameters_from_segments(dataframe, labeled, deleted, self.get_model_type())
self.do_fit(dataframe, labeled, deleted, learning_info)
logging.debug('fit complete successful with self.state: {} for analytic unit: {}'.format(self.state, id))
return self.state
def detect(self, dataframe: pd.DataFrame, id: AnalyticUnitId) -> dict:
logging.debug('Start method detect for analytic unit {}'.format(id))
result = self.do_detect(dataframe)
segments = [(
utils.convert_pd_timestamp_to_ms(dataframe['timestamp'][x[0]]),
utils.convert_pd_timestamp_to_ms(dataframe['timestamp'][x[1]]),
) for x in result]
if not self.state:
logging.warning('Return empty self.state after detect')
logging.debug('Method detect complete successful for analytic unit {}'.format(id))
return {
'segments': segments,
'cache': self.state,
}
def _update_fitting_result(self, state: ModelState, confidences: list, convolve_list: list, del_conv_list: list, height_list: Optional[list] = None) -> None:
state.confidence = float(min(confidences, default = 1.5))
state.convolve_min, state.convolve_max = utils.get_min_max(convolve_list, state.window_size)
state.conv_del_min, state.conv_del_max = utils.get_min_max(del_conv_list, 0)
if height_list is not None:
state.height_min, state.height_max = utils.get_min_max(height_list, 0)
def get_parameters_from_segments(self, dataframe: pd.DataFrame, labeled: List[AnalyticSegment], deleted: List[AnalyticSegment], model: ModelType) -> LearningInfo:
logging.debug('Start parsing segments')
learning_info = LearningInfo()
data = dataframe['value']
for segment in labeled:
confidence = utils.find_confidence(segment.data)[0]
learning_info.confidence.append(confidence)
segment_center = segment.center_index
learning_info.segment_center_list.append(segment_center)
learning_info.pattern_timestamp.append(segment.pattern_timestamp)
aligned_segment = utils.get_interval(data, segment_center, self.state.window_size)
aligned_segment = utils.subtract_min_without_nan(aligned_segment)
if len(aligned_segment) == 0:
logging.warning("can't add segment to learning: segment is empty (center index: {}, window_size: {}, data length: {})".format(
segment_center, self.state.window_size, len(data)))
continue
learning_info.patterns_list.append(aligned_segment)
# TODO: use Triangle/Stair types
if model == ModelType.PEAK or model == ModelType.TROUGH:
learning_info.pattern_height.append(utils.find_confidence(aligned_segment)[1])
learning_info.patterns_value.append(aligned_segment.values.max())
if model == ModelType.JUMP or model == ModelType.DROP:
pattern_height, pattern_length = utils.find_parameters(segment.data, segment.from_index, model.value)
learning_info.pattern_height.append(pattern_height)
learning_info.pattern_width.append(pattern_length)
learning_info.patterns_value.append(aligned_segment.values[self.state.window_size])
logging.debug('Parsing segments ended correctly with learning_info: {}'.format(learning_info))
return learning_info

44
analytics/models/peak_model.py

@@ -0,0 +1,44 @@
from analytic_types import TimeSeries
from models import TriangleModel, ModelType
import utils
import scipy.signal
from scipy.signal import argrelextrema
from typing import Optional, List, Tuple
import numpy as np
import pandas as pd
class PeakModel(TriangleModel):
def get_model_type(self) -> ModelType:
return ModelType.PEAK
def find_segment_center(self, dataframe: pd.DataFrame, start: int, end: int) -> int:
data = dataframe['value']
segment = data[start: end]
return segment.idxmax()
def get_best_pattern(self, close_patterns: TimeSeries, data: pd.Series) -> List[int]:
pattern_list = []
for val in close_patterns:
max_val = data[val[0]]
ind = val[0]
for i in val:
if data[i] > max_val:
max_val = data[i]
ind = i
pattern_list.append(ind)
return pattern_list
def get_extremum_indexes(self, data: pd.Series) -> np.ndarray:
return argrelextrema(data.values, np.greater)[0]
def get_smoothed_data(self, data: pd.Series, confidence: float, alpha: float) -> pd.Series:
return utils.exponential_smoothing(data + self.state.confidence, alpha)
def get_possible_segments(self, data: pd.Series, smoothed_data: pd.Series, peak_indexes: List[int]) -> List[int]:
segments = []
for idx in peak_indexes:
if data[idx] > smoothed_data[idx]:
segments.append(idx)
return segments

147
analytics/models/stair_model.py

@@ -0,0 +1,147 @@
from models import Model, ModelState, AnalyticSegment, ModelType
from analytic_types import TimeSeries
from analytic_types.learning_info import LearningInfo
from scipy.fftpack import fft
from typing import Optional, List
from enum import Enum
import scipy.signal
import utils
import utils.meta
import pandas as pd
import numpy as np
import operator
POSITIVE_SEGMENT_MEASUREMENT_ERROR = 0.2
NEGATIVE_SEGMENT_MEASUREMENT_ERROR = 0.02
@utils.meta.JSONClass
class StairModelState(ModelState):
def __init__(
self,
confidence: float = 0,
stair_height: float = 0,
stair_length: float = 0,
**kwargs
):
super().__init__(**kwargs)
self.confidence = confidence
self.stair_height = stair_height
self.stair_length = stair_length
class StairModel(Model):
def get_state(self, cache: Optional[dict] = None) -> StairModelState:
return StairModelState.from_json(cache)
def get_stair_indexes(self, data: pd.Series, height: float, length: int) -> List[int]:
"""Get list of start stair segment indexes.
Keyword arguments:
data -- data that contains stair (jump or drop) segments
length -- maximum count of values in the stair
height -- the difference between the stair's max_line and min_line (see utils.find_parameters)
"""
indexes = []
for i in range(len(data) - length - 1):
is_stair = self.is_stair_in_segment(data.values[i:i + length + 1], height)
if is_stair:
indexes.append(i)
return indexes
def is_stair_in_segment(self, segment: np.ndarray, height: float) -> bool:
if len(segment) < 2:
return False
comparison_operator = operator.ge
if self.get_model_type() == ModelType.DROP:
comparison_operator = operator.le
height = -height
return comparison_operator(max(segment[1:]), segment[0] + height)
def find_segment_center(self, dataframe: pd.DataFrame, start: int, end: int) -> int:
data = dataframe['value']
segment = data[start: end]
segment_center_index = utils.find_pattern_center(segment, start, self.get_model_type().value)
return segment_center_index
def do_fit(
self,
dataframe: pd.DataFrame,
labeled_segments: List[AnalyticSegment],
deleted_segments: List[AnalyticSegment],
learning_info: LearningInfo
) -> None:
data = utils.cut_dataframe(dataframe)
data = data['value']
window_size = self.state.window_size
last_pattern_center = self.state.pattern_center
self.state.pattern_center = utils.remove_duplicates_and_sort(last_pattern_center + learning_info.segment_center_list)
self.state.pattern_model = utils.get_av_model(learning_info.patterns_list)
convolve_list = utils.get_convolve(self.state.pattern_center, self.state.pattern_model, data, window_size)
correlation_list = utils.get_correlation(self.state.pattern_center, self.state.pattern_model, data, window_size)
height_list = learning_info.patterns_value
del_conv_list = []
delete_pattern_timestamp = []
for segment in deleted_segments:
segment_cent_index = segment.center_index
delete_pattern_timestamp.append(segment.pattern_timestamp)
deleted_stair = utils.get_interval(data, segment_cent_index, window_size)
deleted_stair = utils.subtract_min_without_nan(deleted_stair)
del_conv_stair = scipy.signal.fftconvolve(deleted_stair, self.state.pattern_model)
if len(del_conv_stair) > 0:
del_conv_list.append(max(del_conv_stair))
self._update_fitting_result(self.state, learning_info.confidence, convolve_list, del_conv_list)
self.state.stair_height = int(min(learning_info.pattern_height, default = 1))
self.state.stair_length = int(max(learning_info.pattern_width, default = 1))
def do_detect(self, dataframe: pd.DataFrame) -> TimeSeries:
data = utils.cut_dataframe(dataframe)
data = data['value']
possible_stairs = self.get_stair_indexes(data, self.state.stair_height, self.state.stair_length + 1)
result = self.__filter_detection(possible_stairs, data)
return [(val - 1, val + 1) for val in result]
def __filter_detection(self, segments_indexes: List[int], data: pd.Series) -> List[int]:
delete_list = []
variance_error = self.state.window_size
close_segments = utils.close_filtering(segments_indexes, variance_error)
segments_indexes = utils.best_pattern(close_segments, data, self.get_extremum_type().value)
if len(segments_indexes) == 0 or len(self.state.pattern_center) == 0:
return []
pattern_data = self.state.pattern_model
for segment_index in segments_indexes:
if segment_index <= self.state.window_size or segment_index >= (len(data) - self.state.window_size):
delete_list.append(segment_index)
continue
convol_data = utils.get_interval(data, segment_index, self.state.window_size)
percent_of_nans = convol_data.isnull().sum() / len(convol_data)
if len(convol_data) == 0 or percent_of_nans > 0.5:
delete_list.append(segment_index)
continue
elif 0 < percent_of_nans <= 0.5:
nan_list = utils.find_nan_indexes(convol_data)
convol_data = utils.nan_to_zero(convol_data, nan_list)
pattern_data = utils.nan_to_zero(pattern_data, nan_list)
conv = scipy.signal.fftconvolve(convol_data, pattern_data)
if len(conv) == 0:
delete_list.append(segment_index)
continue
upper_bound = self.state.convolve_max * (1 + POSITIVE_SEGMENT_MEASUREMENT_ERROR)
lower_bound = self.state.convolve_min * (1 - POSITIVE_SEGMENT_MEASUREMENT_ERROR)
delete_up_bound = self.state.conv_del_max * (1 + NEGATIVE_SEGMENT_MEASUREMENT_ERROR)
delete_low_bound = self.state.conv_del_min * (1 - NEGATIVE_SEGMENT_MEASUREMENT_ERROR)
max_conv = max(conv)
if max_conv > upper_bound or max_conv < lower_bound:
delete_list.append(segment_index)
elif max_conv < delete_up_bound and max_conv > delete_low_bound:
delete_list.append(segment_index)
for item in delete_list:
segments_indexes.remove(item)
segments_indexes = utils.remove_duplicates_and_sort(segments_indexes)
return segments_indexes
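For reference, a minimal standalone sketch of the sliding-window scan that get_stair_indexes performs for the jump case; find_jump_starts is a hypothetical helper written only for this illustration and is not part of the module:

import pandas as pd

def find_jump_starts(data: pd.Series, height: float, length: int) -> list:
    # report every index whose next `length` values rise by at least `height`,
    # mirroring is_stair_in_segment for ModelType.JUMP
    indexes = []
    for i in range(len(data) - length - 1):
        window = data.values[i:i + length + 1]
        if max(window[1:]) >= window[0] + height:
            indexes.append(i)
    return indexes

print(find_jump_starts(pd.Series([1, 1, 1, 5, 5, 5]), height=3, length=2))  # [1, 2]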

119
analytics/models/triangle_model.py

@ -0,0 +1,119 @@
from analytic_types import AnalyticUnitId, TimeSeries
from analytic_types.learning_info import LearningInfo
from models import Model, ModelState, AnalyticSegment
import utils
import utils.meta
import scipy.signal
from scipy.fftpack import fft
from typing import Optional, List, Tuple
import numpy as np
import pandas as pd
EXP_SMOOTHING_FACTOR = 0.01
@utils.meta.JSONClass
class TriangleModelState(ModelState):
def __init__(
self,
confidence: float = 0,
height_max: float = 0,
height_min: float = 0,
**kwargs
):
super().__init__(**kwargs)
self.confidence = confidence
self.height_max = height_max
self.height_min = height_min
class TriangleModel(Model):
def get_state(self, cache: Optional[dict] = None) -> TriangleModelState:
return TriangleModelState.from_json(cache)
def do_fit(
self,
dataframe: pd.DataFrame,
labeled_segments: List[AnalyticSegment],
deleted_segments: List[AnalyticSegment],
learning_info: LearningInfo
) -> None:
data = utils.cut_dataframe(dataframe)
data = data['value']
self.state.pattern_center = utils.remove_duplicates_and_sort(self.state.pattern_center + learning_info.segment_center_list)
self.state.pattern_model = utils.get_av_model(learning_info.patterns_list)
convolve_list = utils.get_convolve(self.state.pattern_center, self.state.pattern_model, data, self.state.window_size)
correlation_list = utils.get_correlation(self.state.pattern_center, self.state.pattern_model, data, self.state.window_size)
height_list = learning_info.patterns_value
del_conv_list = []
delete_pattern_width = []
delete_pattern_height = []
delete_pattern_timestamp = []
for segment in deleted_segments:
delete_pattern_timestamp.append(segment.pattern_timestamp)
deleted = utils.get_interval(data, segment.center_index, self.state.window_size)
deleted = utils.subtract_min_without_nan(deleted)
del_conv = scipy.signal.fftconvolve(deleted, self.state.pattern_model)
if len(del_conv):
del_conv_list.append(max(del_conv))
delete_pattern_height.append(utils.find_confidence(deleted)[1])
self._update_fitting_result(self.state, learning_info.confidence, convolve_list, del_conv_list, height_list)
def do_detect(self, dataframe: pd.DataFrame) -> TimeSeries:
data = utils.cut_dataframe(dataframe)
data = data['value']
all_extremum_indexes = self.get_extremum_indexes(data)
smoothed_data = self.get_smoothed_data(data, self.state.confidence, EXP_SMOOTHING_FACTOR)
segments = self.get_possible_segments(data, smoothed_data, all_extremum_indexes)
result = self.__filter_detection(segments, data)
result = utils.get_borders_of_peaks(result, data, self.state.window_size, self.state.confidence)
return result
def __filter_detection(self, segments: List[int], data: pd.Series) -> list:
delete_list = []
variance_error = self.state.window_size
close_patterns = utils.close_filtering(segments, variance_error)
segments = self.get_best_pattern(close_patterns, data)
if len(segments) == 0 or len(self.state.pattern_model) == 0:
return []
pattern_data = self.state.pattern_model
up_height = self.state.height_max * (1 + self.HEIGHT_ERROR)
low_height = self.state.height_min * (1 - self.HEIGHT_ERROR)
up_conv = self.state.convolve_max * (1 + 1.5 * self.CONV_ERROR)
low_conv = self.state.convolve_min * (1 - self.CONV_ERROR)
up_del_conv = self.state.conv_del_max * (1 + self.DEL_CONV_ERROR)
low_del_conv = self.state.conv_del_min * (1 - self.DEL_CONV_ERROR)
for segment in segments:
if segment > self.state.window_size:
convol_data = utils.get_interval(data, segment, self.state.window_size)
convol_data = utils.subtract_min_without_nan(convol_data)
percent_of_nans = convol_data.isnull().sum() / len(convol_data)
if percent_of_nans > 0.5:
delete_list.append(segment)
continue
elif 0 < percent_of_nans <= 0.5:
nan_list = utils.find_nan_indexes(convol_data)
convol_data = utils.nan_to_zero(convol_data, nan_list)
pattern_data = utils.nan_to_zero(pattern_data, nan_list)
conv = scipy.signal.fftconvolve(convol_data, pattern_data)
pattern_height = convol_data.values.max()
if pattern_height > up_height or pattern_height < low_height:
delete_list.append(segment)
continue
if max(conv) > up_conv or max(conv) < low_conv:
delete_list.append(segment)
continue
if max(conv) < up_del_conv and max(conv) > low_del_conv:
delete_list.append(segment)
else:
delete_list.append(segment)
for item in delete_list:
segments.remove(item)
return set(segments)
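A rough, hedged sketch of the candidate step in do_detect for the peak case. It uses pandas' ewm() as a stand-in for utils.exponential_smoothing, so the smoothed baseline is only an approximation of what the model computes:

import numpy as np
import pandas as pd
from scipy.signal import argrelextrema

data = pd.Series([1, 1, 5, 1, 1, 2, 1, 1, 7, 1, 1])
confidence = 1.0
# stand-in for utils.exponential_smoothing(data + confidence, EXP_SMOOTHING_FACTOR)
smoothed = (data + confidence).ewm(alpha=0.01).mean()
extrema = argrelextrema(data.values, np.greater)[0]         # local maxima: [2, 5, 8]
candidates = [i for i in extrema if data[i] > smoothed[i]]  # keep peaks above the baseline
print(candidates)  # [2, 8] -- the small bump at index 5 is filtered out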

44
analytics/models/trough_model.py

@ -0,0 +1,44 @@
from analytic_types import TimeSeries
from models import TriangleModel, ModelType
import utils
import scipy.signal
from scipy.signal import argrelextrema
from typing import Optional, List, Tuple
import numpy as np
import pandas as pd
class TroughModel(TriangleModel):
def get_model_type(self) -> ModelType:
return ModelType.TROUGH
def find_segment_center(self, dataframe: pd.DataFrame, start: int, end: int) -> int:
data = dataframe['value']
segment = data[start: end]
return segment.idxmin()
def get_best_pattern(self, close_patterns: TimeSeries, data: pd.Series) -> List[int]:
pattern_list = []
for val in close_patterns:
min_val = data[val[0]]
ind = val[0]
for i in val:
if data[i] < min_val:
min_val = data[i]
ind = i
pattern_list.append(ind)
return pattern_list
def get_extremum_indexes(self, data: pd.Series) -> np.ndarray:
return argrelextrema(data.values, np.less)[0]
def get_smoothed_data(self, data: pd.Series, confidence: float, alpha: float) -> pd.Series:
return utils.exponential_smoothing(data - confidence, alpha)
def get_possible_segments(self, data: pd.Series, smoothed_data: pd.Series, trough_indexes: List[int]) -> List[int]:
segments = []
for idx in trough_indexes:
if data[idx] < smoothed_data[idx]:
segments.append(idx)
return segments

94
analytics/server.py

@ -0,0 +1,94 @@
#!/usr/bin/env python3
import sys
import os
import config
import json
import logging
import asyncio
import traceback
import services
from analytic_unit_manager import AnalyticUnitManager
server_service: services.ServerService = None
data_service: services.DataService = None
analytic_unit_manager: AnalyticUnitManager = None
logger = logging.getLogger('SERVER')
async def handle_task(task: object):
try:
task_type = task['type']
logger.info("Got {} task with id {}, analyticUnitId {}".format(task_type, task['_id'], task['analyticUnitId']))
task_result_payload = {
'_id': task['_id'],
'task': task_type,
'analyticUnitId': task['analyticUnitId'],
'status': "IN_PROGRESS"
}
if task_type != 'PUSH':
message = services.server_service.ServerMessage('TASK_RESULT', task_result_payload)
await server_service.send_message_to_server(message)
res = await analytic_unit_manager.handle_analytic_task(task)
res['_id'] = task['_id']
if task_type != 'PUSH':
message = services.server_service.ServerMessage('TASK_RESULT', res)
await server_service.send_message_to_server(message)
except Exception as e:
error_text = traceback.format_exc()
logger.error("handle_task Exception: '%s'" % error_text)
async def handle_data(task: object):
res = await analytic_unit_manager.handle_analytic_task(task)
if res['status'] == 'SUCCESS' and res['payload'] is not None:
res['_id'] = task['_id']
message = services.server_service.ServerMessage('PUSH_DETECT', res)
await server_service.send_message_to_server(message)
async def handle_message(message: services.ServerMessage):
if message.method == 'TASK':
await handle_task(message.payload)
if message.method == 'DATA':
await handle_data(message.payload)
def init_services():
global server_service
global data_service
global analytic_unit_manager
logger.info("Starting services...")
logger.info("Server...")
server_service = services.ServerService()
logger.info("Ok")
logger.info("Data service...")
data_service = services.DataService(server_service)
logger.info("Ok")
logger.info("Analytic unit manager...")
analytic_unit_manager = AnalyticUnitManager()
logger.info("Ok")
async def app_loop():
async for message in server_service:
asyncio.ensure_future(handle_message(message))
def run_server():
loop = asyncio.get_event_loop()
#loop.set_debug(True)
logger.info("Ok")
init_services()
print('Analytics process is running') # we need to print to stdout and flush
sys.stdout.flush() # because node.js expects it
loop.run_until_complete(app_loop())

2
analytics/services/__init__.py

@ -0,0 +1,2 @@
from services.server_service import ServerService, ServerMessage
from services.data_service import DataService

85
analytics/services/data_service.py

@ -0,0 +1,85 @@
from services.server_service import ServerMessage, ServerService
import json
import asyncio
"""
This is how you can save a file:
async def test_file_save():
async with data_service.open('filename') as f:
print('write content')
await f.write('test string')
async with data_service.open('filename') as f:
content = await f.load()
print(content)
print('test file ok')
"""
LOCK_WAIT_SLEEP_TIMESPAN = 100 # ms
class FileDescriptor:
def __init__(self, filename: str, data_service):
self.filename = filename
self.data_service = data_service
async def write(self, content: str):
await self.data_service.save_file_content(self, content)
async def load(self) -> str:
return await self.data_service.load_file_content(self)
async def __aenter__(self):
await self.data_service.wait_and_lock(self)
return self
async def __aexit__(self, *exc):
await self.data_service.unlock(self)
class DataService:
def __init__(self, server_service: ServerService):
"""Creates fs over network via server_service"""
self.server_service = server_service
self.locks = set()
def open(self, filename: str) -> FileDescriptor:
return FileDescriptor(filename, self)
async def wait_and_lock(self, file_descriptor: FileDescriptor):
filename = file_descriptor.filename
while True:
if filename in self.locks:
await asyncio.sleep(LOCK_WAIT_SLEEP_TIMESPAN / 1000) # asyncio.sleep expects seconds
continue
else:
self.locks.add(filename)
break
async def unlock(self, file_descriptor: FileDescriptor):
filename = file_descriptor.filename
self.locks.remove(filename)
async def save_file_content(self, file_descriptor: FileDescriptor, content: str):
""" Saves json - serializable obj with file_descriptor.filename """
self.__check_lock(file_descriptor)
message_payload = {
'filename': file_descriptor.filename,
'content': content
}
message = ServerMessage('FILE_SAVE', message_payload)
await self.server_service.send_request_to_server(message)
async def load_file_content(self, file_descriptor: FileDescriptor) -> str:
self.__check_lock(file_descriptor)
message_payload = { 'filename': file_descriptor.filename }
message = ServerMessage('FILE_LOAD', message_payload)
return await self.server_service.send_request_to_server(message)
def __check_lock(self, file_descriptor: FileDescriptor):
filename = file_descriptor.filename
if filename not in self.locks:
raise RuntimeError('No lock for file %s' % filename)

149
analytics/services/server_service.py

@ -0,0 +1,149 @@
import config
import websockets
import logging
import json
import asyncio
import traceback
import utils.concurrent
import utils.meta
from typing import Optional
logger = logging.getLogger('SERVER_SERVICE')
PARSE_MESSAGE_OR_SAVE_LOOP_INTERRUPTED = False
SERVER_SOCKET_RECV_LOOP_INTERRUPTED = False
@utils.meta.JSONClass
class ServerMessage:
def __init__(self, method: str, payload: object = None, request_id: int = None):
# TODO: add error type / case
self.method = method
self.payload = payload
self.request_id = request_id
class ServerService(utils.concurrent.AsyncZmqActor):
def __init__(self):
super(ServerService, self).__init__()
self.__aiter_inited = False
self.__server_socket: Optional[websockets.WebSocketClientProtocol] = None
self.__request_next_id = 1
self.__reconnecting = False
self.__responses = dict()
self.start()
async def send_message_to_server(self, message: ServerMessage):
# Following message will be sent to actor's self._on_message()
# We do it cuz we created self.__server_socket in self._run() method,
# which runs in the actor's thread, not the thread we created ServerService
# in theory, we can try to use zmq.proxy:
# zmq.proxy(self.__actor_socket, self.__server_socket)
# and do here something like:
# self.__actor_socket.send_string(json.dumps(message.to_json()))
await self._put_message_to_thread(json.dumps(message.to_json()))
async def send_request_to_server(self, message: ServerMessage) -> object:
if message.request_id is not None:
raise ValueError('Message can`t have request_id before it is scheduled')
request_id = message.request_id = self.__request_next_id
self.__request_next_id += 1
asyncio.ensure_future(self.send_message_to_server(message))
# you should await self.__responses[request_id] which should be a task,
# which you resolve somewhere else
while request_id not in self.__responses:
await asyncio.sleep(1)
response = self.__responses[request_id]
del self.__responses[request_id]
return response
def __aiter__(self):
if self.__aiter_inited:
raise RuntimeError('Can`t iterate twice')
self.__aiter_inited = True
return self
async def __anext__(self) -> ServerMessage:
while not PARSE_MESSAGE_OR_SAVE_LOOP_INTERRUPTED:
thread_message = await self._recv_message_from_thread()
server_message = self.__parse_message_or_save(thread_message)
if server_message is None:
continue
else:
return server_message
async def _run_thread(self):
logger.info("Binding to %s ..." % config.HASTIC_SERVER_URL)
# TODO: consider to use async context for socket
await self.__server_socket_recv_loop()
async def _on_message_to_thread(self, message: str):
if self.__server_socket is None or self.__server_socket.closed:
await self.__reconnect()
await self.__server_socket.send(message)
async def __server_socket_recv_loop(self):
while not SERVER_SOCKET_RECV_LOOP_INTERRUPTED:
received_string = await self.__reconnect_recv()
if received_string == 'PING':
asyncio.ensure_future(self.__handle_ping())
else:
asyncio.ensure_future(self._send_message_from_thread(received_string))
async def __reconnect(self):
if not self.__reconnecting:
self.__reconnecting = True
else:
while self.__reconnecting:
await asyncio.sleep(1)
return
if self.__server_socket is not None:
await self.__server_socket.close()
self.__server_socket = await websockets.connect(config.HASTIC_SERVER_URL)
first_message = await self.__server_socket.recv()
if first_message == 'EALREADYEXISTING':
raise ConnectionError('Can`t connect as a second analytics')
self.__reconnecting = False
async def __reconnect_recv(self) -> str:
while not SERVER_SOCKET_RECV_LOOP_INTERRUPTED:
try:
if self.__server_socket is None or self.__server_socket.closed:
await self.__reconnect()
return await self.__server_socket.recv()
except (ConnectionRefusedError, websockets.ConnectionClosedError):
if self.__server_socket is not None:
await self.__server_socket.close()
# TODO: this logic increases the number of ThreadPoolExecutor
self.__server_socket = None
# TODO: move to config
reconnect_delay = 3
print('connection is refused or lost, trying to reconnect in %s seconds' % reconnect_delay)
await asyncio.sleep(reconnect_delay)
raise InterruptedError()
async def __handle_ping(self):
if self.__server_socket is None or self.__server_socket.closed:
await self.__reconnect()
await self.__server_socket.send('PONG')
def __parse_message_or_save(self, text: str) -> Optional[ServerMessage]:
try:
message_object = json.loads(text)
message = ServerMessage.from_json(message_object)
if message.request_id is not None:
self.__responses[message_object['requestId']] = message.payload
return None
return message
except Exception:
error_text = traceback.format_exc()
logger.error("__handle_message Exception: '%s'" % error_text)

4
analytics/utils/__init__.py

@ -0,0 +1,4 @@
from utils.common import *
from utils.time import *
from utils.dataframe import *
from utils.meta import *

443
analytics/utils/common.py

@ -0,0 +1,443 @@
import numpy as np
import pandas as pd
import scipy.signal
from scipy.fftpack import fft
from scipy.signal import argrelextrema
from scipy.stats import gaussian_kde
from scipy.stats.stats import pearsonr
import math
from typing import Optional, Union, List, Generator, Tuple
import utils
import logging
from itertools import islice
from collections import deque
from analytic_types import TimeSeries
from analytic_types.segment import Segment
SHIFT_FACTOR = 0.05
CONFIDENCE_FACTOR = 0.5
SMOOTHING_FACTOR = 5
MEASUREMENT_ERROR = 0.05
def exponential_smoothing(series: pd.Series, alpha: float, last_smoothed_value: Optional[float] = None) -> pd.Series:
if alpha < 0 or alpha > 1:
raise ValueError('Alpha must be within the boundaries: 0 <= alpha <= 1')
if len(series) < 2:
return series
if last_smoothed_value is None:
result = [series.values[0]]
else:
result = [float(last_smoothed_value)]
if np.isnan(result[0]):
result = [0]
for n in range(1, len(series)):
if np.isnan(series[n]):
result.append((1 - alpha) * result[n - 1])
series.values[n] = result[n]
else:
result.append(alpha * series[n] + (1 - alpha) * result[n - 1])
assert len(result) == len(series), \
f'len of smoothed data {len(result)} != len of original dataset {len(series)}'
return pd.Series(result, index = series.index)
def find_pattern(data: pd.Series, height: float, length: int, pattern_type: str) -> list:
pattern_list = []
right_bound = len(data) - length - 1
for i in range(right_bound):
for x in range(1, length):
if pattern_type == 'jump':
if(data[i + x] > data[i] + height):
pattern_list.append(i)
elif pattern_type == 'drop':
if(data[i + x] < data[i] - height):
pattern_list.append(i)
return pattern_list
def timestamp_to_index(dataframe: pd.DataFrame, timestamp: int):
data = dataframe['timestamp']
idx, = np.where(data >= timestamp)
if len(idx) > 0:
time_ind = int(idx[0])
else:
raise ValueError('Dataframe doesn`t contain timestamp: {}'.format(timestamp))
return time_ind
def find_peaks(data: Generator[float, None, None], size: int) -> Generator[float, None, None]:
window = deque(islice(data, size * 2 + 1))
for i, v in enumerate(data, size):
current = window[size]
#TODO: remove max() from loop
if current == max(window) and current != window[size + 1]:
yield i, current
window.append(v)
window.popleft()
def ar_mean(numbers: List[float]):
return float(sum(numbers)) / max(len(numbers), 1)
def get_av_model(patterns_list: list):
if not patterns_list: return []
patterns_list = get_same_length(patterns_list)
value_list = list(map(list, zip(*patterns_list)))
return list(map(ar_mean, value_list))
def get_same_length(patterns_list: list):
for index in range(len(patterns_list)):
if type(patterns_list[index]) == pd.Series:
patterns_list[index] = patterns_list[index].tolist()
patterns_list = list(filter(None, patterns_list))
max_length = max(map(len, patterns_list))
for pat in patterns_list:
if len(pat) < max_length:
length_difference = max_length - len(pat)
added_values = list(0 for _ in range(length_difference))
pat.extend(added_values)
return patterns_list
def close_filtering(pattern_list: List[int], win_size: int) -> TimeSeries:
if len(pattern_list) == 0:
return []
s = [[pattern_list[0]]]
k = 0
for i in range(1, len(pattern_list)):
if pattern_list[i] - win_size <= s[k][-1]:
s[k].append(pattern_list[i])
else:
k += 1
s.append([pattern_list[i]])
return s
def merge_intersecting_segments(segments: List[Segment], time_step: int) -> List[Segment]:
'''
Find intersecting segments in the segments list and merge them.
'''
if len(segments) < 2:
return segments
segments = sorted(segments, key = lambda segment: segment.from_timestamp)
previous_segment = segments[0]
for i in range(1, len(segments)):
if segments[i].from_timestamp <= previous_segment.to_timestamp + time_step:
segments[i].message = segments[-1].message
segments[i].from_timestamp = min(previous_segment.from_timestamp, segments[i].from_timestamp)
segments[i].to_timestamp = max(previous_segment.to_timestamp, segments[i].to_timestamp)
segments[i - 1] = None
previous_segment = segments[i]
segments = [x for x in segments if x is not None]
return segments
def find_interval(dataframe: pd.DataFrame) -> int:
if len(dataframe) < 2:
raise ValueError('Can`t find interval: length of data must be at least 2')
delta = utils.convert_pd_timestamp_to_ms(dataframe.timestamp[1]) - utils.convert_pd_timestamp_to_ms(dataframe.timestamp[0])
return delta
def get_start_and_end_of_segments(segments: List[List[int]]) -> TimeSeries:
'''
find the start and end of each segment: [1, 2, 3, 4] -> [1, 4]
a single-index segment is doubled: [7] -> [7, 7]
'''
result = []
for segment in segments:
if len(segment) == 0:
continue
elif len(segment) > 1:
segment = [segment[0], segment[-1]]
else:
segment = [segment[0], segment[0]]
result.append(segment)
return result
def best_pattern(pattern_list: list, data: pd.Series, dir: str) -> list:
new_pattern_list = []
for val in pattern_list:
max_val = data[val[0]]
min_val = data[val[0]]
ind = val[0]
for i in val:
if dir == 'max':
if data[i] > max_val:
max_val = data[i]
ind = i
else:
if data[i] < min_val:
min_val = data[i]
ind = i
new_pattern_list.append(ind)
return new_pattern_list
def find_nan_indexes(segment: pd.Series) -> list:
nan_list = pd.isnull(segment)
nan_list = np.array(nan_list)
nan_indexes = np.where(nan_list == True)[0]
return list(nan_indexes)
def check_nan_values(segment: Union[pd.Series, list]) -> Union[pd.Series, list]:
nan_list = utils.find_nan_indexes(segment)
if len(nan_list) > 0:
segment = utils.nan_to_zero(segment, nan_list)
return segment
def nan_to_zero(segment: Union[pd.Series, list], nan_list: list) -> Union[pd.Series, list]:
if type(segment) == pd.Series:
for val in nan_list:
segment.values[val] = 0
else:
for val in nan_list:
segment[val] = 0
return segment
def find_confidence(segment: pd.Series) -> Tuple[float, float]:
segment = utils.check_nan_values(segment)
segment_min = min(segment)
segment_max = max(segment)
height = segment_max - segment_min
if height:
return (CONFIDENCE_FACTOR * height, height)
else:
return (0, 0)
def find_width(pattern: pd.Series, selector: bool) -> int:
pattern = pattern.values
center = utils.find_extremum_index(pattern, selector)
pattern_left = pattern[:center]
pattern_right = pattern[center:]
left_extremum_index = utils.find_last_extremum(pattern_left, selector)
right_extremum_index = utils.find_extremum_index(pattern_right, not selector)
left_width = center - left_extremum_index
right_width = right_extremum_index + 1
return right_width + left_width
def find_last_extremum(segment: np.ndarray, selector: bool) -> int:
segment = segment[::-1]
first_extremum_ind = find_extremum_index(segment, not selector)
last_extremum_ind = len(segment) - first_extremum_ind - 1
return last_extremum_ind
def find_extremum_index(segment: np.ndarray, selector: bool) -> int:
if selector:
return segment.argmax()
else:
return segment.argmin()
def get_interval(data: pd.Series, center: int, window_size: int, normalization = False) -> pd.Series:
"""
Get an interval with 2*window_size length
window_size to the left, window_size to the right of center
If normalization == True - subtract minimum from the interval
"""
if center >= len(data):
logging.warning('Pattern center {} is out of data with len {}'.format(center, len(data)))
return []
left_bound = center - window_size
right_bound = center + window_size + 1
if left_bound < 0:
left_bound = 0
if right_bound > len(data):
right_bound = len(data)
result_interval = data[left_bound: right_bound]
if normalization:
result_interval = subtract_min_without_nan(result_interval)
return result_interval
def get_borders_of_peaks(pattern_centers: List[int], data: pd.Series, window_size: int, confidence: float, max_border_factor = 1.0, inverse = False) -> TimeSeries:
"""
Find the start and end of each peak pattern
max_border_factor - scales window_size to get the final border of the pattern
if inverse == True, segments are inverted (trough -> peak / peak -> trough)
"""
if len(pattern_centers) == 0:
return []
border_list = []
window_size = math.ceil(max_border_factor * window_size)
for center in pattern_centers:
current_pattern = get_interval(data, center, window_size, True)
if inverse:
current_pattern = inverse_segment(current_pattern)
current_pattern = current_pattern - confidence
left_segment = current_pattern[:window_size] # a.iloc[a.index < center]
right_segment = current_pattern[window_size:] # a.iloc[a.index >= center]
left_border = get_end_of_segment(left_segment, descending = False)
right_border = get_end_of_segment(right_segment)
border_list.append((left_border, right_border))
return border_list
def get_end_of_segment(segment: pd.Series, skip_positive_values = True, descending = True) -> int:
"""
Find end of descending or ascending part of pattern
Allowable error is 1 index
"""
if not descending:
segment = segment.iloc[::-1]
if len(segment) == 0:
return 1
for idx in range(1, len(segment) - 1):
if skip_positive_values and segment.values[idx] > 0:
continue
if segment.values[idx] >= segment.values[idx - 1]:
return segment.index[idx - 1]
return segment.index[-1]
def inverse_segment(segment: pd.Series) -> pd.Series:
"""
Convert a trough to a peak and vice versa
"""
if len(segment) > 0:
rev_val = max(segment.values)
for idx in range(len(segment)):
segment.values[idx] = math.fabs(segment.values[idx] - rev_val)
return segment
def subtract_min_without_nan(segment: pd.Series) -> pd.Series:
if len(segment) == 0:
return []
nan_list = utils.find_nan_indexes(segment)
if len(nan_list) > 0:
return segment
else:
segment = segment - min(segment)
return segment
def get_convolve(segments: list, av_model: list, data: pd.Series, window_size: int) -> list:
labeled_segment = []
convolve_list = []
for segment in segments:
labeled_segment = utils.get_interval(data, segment, window_size)
labeled_segment = utils.subtract_min_without_nan(labeled_segment)
labeled_segment = utils.check_nan_values(labeled_segment)
auto_convolve = scipy.signal.fftconvolve(labeled_segment, labeled_segment)
convolve_segment = scipy.signal.fftconvolve(labeled_segment, av_model)
if len(auto_convolve) > 0:
convolve_list.append(max(auto_convolve))
if len(convolve_segment) > 0:
convolve_list.append(max(convolve_segment))
return convolve_list
def get_correlation_gen(data: pd.Series, window_size: int, pattern_model: List[float]) -> Generator[float, None, None]:
# Get a new dataset by correlating a sliding window over data with pattern_model
for i in range(window_size, len(data) - window_size):
watch_data = data[i - window_size: i + window_size + 1]
correlation = pearsonr(watch_data, pattern_model)
if len(correlation) > 0:
yield(correlation[0])
def get_correlation(segments: list, av_model: list, data: pd.Series, window_size: int) -> list:
labeled_segment = []
correlation_list = []
p_value_list = []
for segment in segments:
labeled_segment = utils.get_interval(data, segment, window_size)
labeled_segment = utils.subtract_min_without_nan(labeled_segment)
labeled_segment = utils.check_nan_values(labeled_segment)
if len(labeled_segment) == 0 or len(labeled_segment) != len(av_model):
continue
correlation = pearsonr(labeled_segment, av_model)
if len(correlation) > 1:
correlation_list.append(correlation[0])
p_value_list.append(correlation[1])
return correlation_list
def get_distribution_density(segment: pd.Series) -> Tuple[float, float, float]:
segment.dropna(inplace = True)
if len(segment) < 2 or len(segment.nonzero()[0]) == 0:
return (0, 0, 0)
min_jump = min(segment)
max_jump = max(segment)
pdf = gaussian_kde(segment)
x = np.linspace(segment.min() - 1, segment.max() + 1, len(segment))
y = pdf(x)
ax_list = list(zip(x, y))
ax_list = np.array(ax_list, np.float32)
antipeaks_kde = argrelextrema(np.array(ax_list), np.less)[0]
peaks_kde = argrelextrema(np.array(ax_list), np.greater)[0]
try:
min_peak_index = peaks_kde[0]
segment_min_line = ax_list[min_peak_index, 0]
max_peak_index = peaks_kde[1]
segment_max_line = ax_list[max_peak_index, 0]
segment_median = ax_list[antipeaks_kde[0], 0]
except IndexError:
segment_max_line = max_jump * (1 - SHIFT_FACTOR)
segment_min_line = min_jump * (1 - SHIFT_FACTOR)
segment_median = (max_jump - min_jump) / 2 + min_jump
return segment_median, segment_max_line, segment_min_line
def find_parameters(segment_data: pd.Series, segment_from_index: int, pat_type: str) -> Tuple[float, int]:
segment = segment_data
if len(segment_data) > SMOOTHING_FACTOR * 3:
flat_segment = segment_data.rolling(window = SMOOTHING_FACTOR).mean()
segment = flat_segment.dropna()
segment_median, segment_max_line, segment_min_line = utils.get_distribution_density(segment)
height = 0.95 * (segment_max_line - segment_min_line)
length = utils.get_pattern_length(segment_data, segment_min_line, segment_max_line, pat_type)
return height, length
def find_pattern_center(segment_data: pd.Series, segment_from_index: int, pattern_type: str):
segment_median = utils.get_distribution_density(segment_data)[0]
cen_ind = utils.pattern_intersection(segment_data.tolist(), segment_median, pattern_type)
if len(cen_ind) > 0:
pat_center = cen_ind[0]
segment_cent_index = pat_center + segment_from_index
else:
segment_cent_index = math.ceil((len(segment_data)) / 2)
return segment_cent_index
def get_pattern_length(segment_data: pd.Series, segment_min_line: float, segment_max_line: float, pat_type: str) -> int:
# TODO: move function to jump & drop merged model
segment_max = max(segment_data)
segment_min = min(segment_data)
# TODO: use better way
if segment_min_line <= segment_min:
segment_min_line = segment_min * (1 + MEASUREMENT_ERROR)
if segment_max_line >= segment_max:
segment_max_line = segment_max * (1 - MEASUREMENT_ERROR)
min_line = []
max_line = []
for i in range(len(segment_data)):
min_line.append(segment_min_line)
max_line.append(segment_max_line)
min_line = np.array(min_line)
max_line = np.array(max_line)
segment_array = np.array(segment_data.tolist())
idmin = np.argwhere(np.diff(np.sign(min_line - segment_array)) != 0).reshape(-1)
idmax = np.argwhere(np.diff(np.sign(max_line - segment_array)) != 0).reshape(-1)
if len(idmin) > 0 and len(idmax) > 0:
if pat_type == 'jump':
result_length = idmax[0] - idmin[-1] + 1
elif pat_type == 'drop':
result_length = idmin[0] - idmax[-1] + 1
return result_length if result_length > 0 else 0
else:
return 0
def pattern_intersection(segment_data: list, median: float, pattern_type: str) -> list:
center_index = []
if pattern_type == 'jump':
for i in range(1, len(segment_data) - 1):
if segment_data[i - 1] < median and segment_data[i + 1] > median:
center_index.append(i)
elif pattern_type == 'drop':
for i in range(1, len(segment_data) - 1):
if segment_data[i - 1] > median and segment_data[i + 1] < median:
center_index.append(i)
delete_index = []
for i in range(1, len(center_index)):
if center_index[i] == center_index[i - 1] + 1:
delete_index.append(i - 1)
return [x for (idx, x) in enumerate(center_index) if idx not in delete_index]
def cut_dataframe(data: pd.DataFrame) -> pd.DataFrame:
data_min = data['value'].min()
if not np.isnan(data_min) and data_min > 0:
data['value'] = data['value'] - data_min
return data
def get_min_max(array: list, default):
return float(min(array, default=default)), float(max(array, default=default))
def remove_duplicates_and_sort(array: list) -> list:
array = list(frozenset(array))
array.sort()
return array
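A small usage sketch for two of the helpers above; it assumes the analytics directory is on PYTHONPATH (as .vscode/.env sets), so `utils` resolves to this package:

import pandas as pd
import utils

series = pd.Series([1.0, 2.0, 10.0, 2.0, 1.0])
print(utils.exponential_smoothing(series, alpha=0.5).tolist())
# [1.0, 1.5, 5.75, 3.875, 2.4375]

hits = [3, 4, 5, 20, 21]
groups = utils.close_filtering(hits, win_size=2)        # [[3, 4, 5], [20, 21]]
print(utils.get_start_and_end_of_segments(groups))      # [[3, 5], [20, 21]]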

130
analytics/utils/concurrent.py

@ -0,0 +1,130 @@
import asyncio
import threading
import zmq
import zmq.asyncio
from abc import ABC, abstractmethod
# This const defines the Thread <-> Actor zmq one-to-one connection.
# We create a separate zmq context, so the zmq address 'inproc://xxx' doesn't matter.
# It is a default address; you may want to use AsyncZmqThread in another way.
ZMQ_THREAD_ACTOR_ADDR = 'inproc://xxx'
# Inheritance order (threading.Thread, ABC) is essential; otherwise it's an MRO error.
class AsyncZmqThread(threading.Thread, ABC):
"""Class for wrapping zmq socket into a thread with it's own asyncio event loop
"""
def __init__(self,
zmq_context: zmq.asyncio.Context,
zmq_socket_addr: str,
zmq_socket_type = zmq.PAIR
):
super(AsyncZmqThread, self).__init__()
self._zmq_context = zmq_context # you can use it in child classes
self.__zmq_socket_addr = zmq_socket_addr
self.__zmq_socket_type = zmq_socket_type
self.__asyncio_loop = None
self.__zmq_socket = None
async def __message_recv_loop(self):
while True:
text = await self.__zmq_socket.recv_string()
asyncio.ensure_future(self._on_message_to_thread(text))
async def _send_message_from_thread(self, message: str):
await self.__zmq_socket.send_string(message)
@abstractmethod
async def _on_message_to_thread(self, message: str):
"""Override this method to receive messages"""
@abstractmethod
async def _run_thread(self):
"""Override this method to do some async work.
This method uses a separate thread.
You can block yourself here if you don't do any await.
Example:
```
async def _run_thread(self):
i = 0
while True:
await asyncio.sleep(1)
i += 1
await self._send_message_from_thread(f'{self.name}: ping {i}')
```
"""
def run(self):
self.__asyncio_loop = asyncio.new_event_loop()
asyncio.set_event_loop(self.__asyncio_loop)
self.__zmq_socket = self._zmq_context.socket(self.__zmq_socket_type)
self.__zmq_socket.connect(self.__zmq_socket_addr)
asyncio.ensure_future(self.__message_recv_loop())
self.__asyncio_loop.run_until_complete(self._run_thread())
# TODO: implement stop signal handling
class AsyncZmqActor(AsyncZmqThread):
"""Threaded and Async Actor model based on ZMQ inproc communication
override following:
```
async def _run_thread(self)
async def _on_message_to_thread(self, message: str)
```
both methods run in actor's thread
you can call `self._send_message_from_thread('txt')`
to receive it later in `self._recv_message_from_thread()`.
Example:
```
class MyActor(AsyncZmqActor):
async def _run_thread(self):
self.counter = 0
# runs in a different thread
await self._send_message_from_thread('some_txt_message_to_actor')
async def _on_message_to_thread(self, message):
# runs in the actor's thread
self.counter += 1
asyncZmqActor = MyActor()
asyncZmqActor.start()
```
"""
def __init__(self):
super(AsyncZmqActor, self).__init__(zmq.asyncio.Context(), ZMQ_THREAD_ACTOR_ADDR)
self.__actor_socket = self._zmq_context.socket(zmq.PAIR)
self.__actor_socket.bind(ZMQ_THREAD_ACTOR_ADDR)
async def _put_message_to_thread(self, message: str):
"""It "sends" `message` to thread,
but we can't await its `AsyncZmqThread._on_message_to_thread()`
so it's "put", not "send"
"""
await self.__actor_socket.send_string(message)
async def _recv_message_from_thread(self) -> str:
"""Returns next message ``'txt'`` from thread sent by
``AsyncZmqActor._send_message_from_thread('txt')``
"""
return await self.__actor_socket.recv_string()
# TODO: implement graceful stopping

63
analytics/utils/dataframe.py

@ -0,0 +1,63 @@
from itertools import chain
import pandas as pd
import numpy as np
from typing import Generator
def prepare_data(data: list) -> pd.DataFrame:
"""
Takes a list of [timestamp, value] pairs and:
- converts it into a pd.DataFrame,
- converts the 'timestamp' column to pd.Datetime,
- replaces None values with np.nan
"""
data = pd.DataFrame(data, columns=['timestamp', 'value'])
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='ms')
data.fillna(value = np.nan, inplace = True)
return data
def get_intersected_chunks(data: list, intersection: int, chunk_size: int) -> Generator[list, None, None]:
"""
Returns a generator that splits the data into intersected chunks.
The intersection makes it possible to detect patterns that lie on the border between chunks.
intersection - length of the intersection
chunk_size - length of a chunk
"""
assert chunk_size > 0, 'chunk size must be greater than zero'
assert intersection > 0, 'intersection length must be greater than zero'
data_len = len(data)
if data_len <= chunk_size:
yield data
return
nonintersected = chunk_size - intersection
offset = 0
while True:
left_values = data_len - offset
if left_values == 0:
break
if left_values <= chunk_size:
yield data[offset : data_len]
break
else:
yield data[offset: offset + chunk_size]
offset += min(nonintersected, left_values)
def get_chunks(data: list, chunk_size: int) -> Generator[list, None, None]:
"""
Returns a generator that splits the data into non-intersected chunks.
chunk_size - length of a chunk
"""
assert chunk_size > 0, 'chunk size must be greater than zero'
chunks_iterables = [iter(data)] * chunk_size
result_chunks = zip(*chunks_iterables)
partial_chunk_len = len(data) % chunk_size
if partial_chunk_len != 0:
result_chunks = chain(result_chunks, [data[-partial_chunk_len:]])
for chunk in result_chunks:
yield list(chunk)
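A quick illustration of the two chunk helpers above (again assuming the analytics directory is on PYTHONPATH):

from utils.dataframe import get_chunks, get_intersected_chunks

data = list(range(7))
print(list(get_chunks(data, chunk_size=3)))
# [[0, 1, 2], [3, 4, 5], [6]]
print(list(get_intersected_chunks(data, intersection=1, chunk_size=3)))
# [[0, 1, 2], [2, 3, 4], [4, 5, 6]]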

81
analytics/utils/meta.py

@ -0,0 +1,81 @@
from inspect import signature, Parameter
from functools import wraps
from typing import Optional, List
import re
CAMEL_REGEX = re.compile(r'([A-Z])')
UNDERSCORE_REGEX = re.compile(r'_([a-z])')
def camel_to_underscore(name):
#TODO: need to rename 'from'/'to' to 'from_timestamp'/'to_timestamp' everywhere(in analytics, server, panel)
if name == 'from' or name == 'to':
name += '_timestamp'
return CAMEL_REGEX.sub(lambda x: '_' + x.group(1).lower(), name)
def underscore_to_camel(name):
if name == 'from_timestamp' or name == 'to_timestamp':
name = name.replace('_timestamp', '')
return UNDERSCORE_REGEX.sub(lambda x: x.group(1).upper(), name)
def is_field_private(field_name: str) -> bool:
m = re.match(r'_[^(__)]+__', field_name)
return m is not None
def serialize(obj):
if hasattr(obj, 'to_json'):
return obj.to_json()
else:
return obj
def inited_params(target_init):
target_params = signature(target_init).parameters.values()
if len(target_params) < 1:
raise ValueError('init function must have at least the self parameter')
if len(target_params) == 1:
return target_init
_, *target_params = target_params # we will not use self any more
@wraps(target_init)
def wrapped_init(wrapped_self, *wrapped_args, **wrapped_kwargs):
for tp in target_params:
if tp.default is Parameter.empty:
continue
setattr(wrapped_self, tp.name, tp.default)
for tp, v in zip(target_params, wrapped_args):
setattr(wrapped_self, tp.name, v)
for k, v in wrapped_kwargs.items():
setattr(wrapped_self, k, v)
target_init(wrapped_self, *wrapped_args, **wrapped_kwargs)
return wrapped_init
def JSONClass(target_class):
def to_json(self) -> dict:
"""
returns a JSON representation of the object
where all None values and private fields are skipped
"""
return {
underscore_to_camel(k): serialize(v) for k, v in self.__dict__.items()
if v is not None and not is_field_private(k)
}
def from_json(json_object: Optional[dict]) -> target_class:
if json_object is None:
json_object = {}
init_object = { camel_to_underscore(k): v for k, v in json_object.items() }
return target_class(**init_object)
# target_class.__init__ = inited_params(target_class.__init__)
target_class.to_json = to_json
target_class.from_json = from_json
return target_class
class SerializableList(List[dict]):
def to_json(self):
return list(map(lambda s: s.to_json(), self))
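A hypothetical round trip with the JSONClass decorator above; ExampleState exists only for this sketch, and the import assumes the analytics directory is on PYTHONPATH. camelCase keys map to snake_case attributes and back, and None values are dropped by to_json():

import utils.meta

@utils.meta.JSONClass
class ExampleState:
    def __init__(self, window_size: int = 0, pattern_center: list = None):
        self.window_size = window_size
        self.pattern_center = pattern_center

state = ExampleState.from_json({'windowSize': 5, 'patternCenter': [1, 2]})
print(state.window_size)   # 5
print(state.to_json())     # {'windowSize': 5, 'patternCenter': [1, 2]}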

13
analytics/utils/time.py

@ -0,0 +1,13 @@
import pandas as pd
from typing import List
def convert_sec_to_ms(sec) -> int:
return int(sec) * 1000
def convert_pd_timestamp_to_ms(timestamp: pd.Timestamp) -> int:
# TODO: convert from nanoseconds to millisecond in a better way: not by dividing by 10^6
return int(timestamp.value) // 1000000
def convert_series_to_timestamp_list(series: pd.Series) -> List[int]:
timestamps = map(lambda value: convert_pd_timestamp_to_ms(value), series)
return list(timestamps)
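For orientation: the base timestamp used throughout the tests, 1523889000000 ms, is pd.Timestamp('2018-04-16 14:30:00') UTC, which is exactly what convert_pd_timestamp_to_ms returns for it (assuming the analytics directory is on PYTHONPATH):

import pandas as pd
from utils.time import convert_pd_timestamp_to_ms

print(convert_pd_timestamp_to_ms(pd.Timestamp('2018-04-16 14:30:00')))  # 1523889000000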

32
bin/server

@ -0,0 +1,32 @@
#!/usr/bin/env python3
import sys
import os
if sys.version_info[:3] < (3, 6, 5) or sys.version_info[:2] >= (3, 7):
sys.stderr.write('Required python is >= 3.6.5 and < 3.7.0 \n')
sys.stderr.write('Your python version is: %d.%d.%d\n' % sys.version_info[:3])
sys.exit(1)
# TODO: make a wrapper script that sets PYTHONPATH instead
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'analytics'))
import logging
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)
logging_formatter = logging.Formatter("%(asctime)s [Analytics] [%(levelname)-5.5s] %(message)s")
logging_handler = logging.StreamHandler(sys.stdout)
logging_handler.setLevel(logging.DEBUG)
logging_handler.setFormatter(logging_formatter)
root_logger.addHandler(logging_handler)
from server import run_server
if __name__ == "__main__":
run_server()

1
pyinstaller_hooks/hook-pandas.py

@ -0,0 +1 @@
hiddenimports=['pandas._libs.tslibs.timedeltas']

1
pyinstaller_hooks/hook-scipy.py

@ -0,0 +1 @@
hiddenimports=['scipy._lib.messagestream']

7
requirements.txt

@ -0,0 +1,7 @@
attrdict==2.0.0
aiounittest==1.1.0
numpy==1.14.5
pandas==0.20.3
pyzmq==18.0.1
scipy==1.1.0
websockets==8.1

3
scripts/build-dist.sh

@ -0,0 +1,3 @@
#!/bin/bash
cd ..
python3.6 -m PyInstaller --paths=analytics/ --additional-hooks-dir=pyinstaller_hooks bin/server

4
tests/__init__.py

@ -0,0 +1,4 @@
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'analytics'))

16
tests/test_analytic_types.py

@ -0,0 +1,16 @@
from analytic_types import TimeSeriesIndex, TimeSeries2
import unittest
class TestDataset(unittest.TestCase):
def test_basic_timeseries_index(self):
tsi = TimeSeriesIndex(['2017-12-31 16:00:00-08:00'])
self.assertEqual(len(tsi), 1)
tsi2 = TimeSeriesIndex(['2017-12-31 16:00:00-08:00', '2017-12-31 17:00:00-08:00', '2017-12-31 18:00:00-08:00'])
self.assertEqual(len(tsi2), 3)
def test_basic_timeseries(self):
tsis = TimeSeriesIndex(['2017-12-31 16:00:00-08:00', '2017-12-31 17:00:00-08:00', '2017-12-31 18:00:00-08:00'])
ts = TimeSeries2([4, 5, 6], tsis)
self.assertEqual(len(ts), 3)

38
tests/test_bucket.py

@ -0,0 +1,38 @@
import unittest
import pandas as pd
import random
from typing import List
from analytic_types.data_bucket import DataBucket
from tests.test_dataset import create_list_of_timestamps
class TestBucket(unittest.TestCase):
def test_receive_data(self):
bucket = DataBucket()
data_val = list(range(6))
timestamp_list = create_list_of_timestamps(len(data_val))
for val in data_val:
bucket.receive_data(get_pd_dataframe([val], [1523889000000 + val]))
for idx, row in bucket.data.iterrows():
self.assertEqual(data_val[idx], row['value'])
self.assertEqual(timestamp_list[idx], row['timestamp'])
def test_drop_data(self):
bucket = DataBucket()
data_val = list(range(10))
timestamp_list = create_list_of_timestamps(len(data_val))
bucket.receive_data(get_pd_dataframe(data_val, timestamp_list))
bucket.drop_data(5)
expected_data = data_val[5:]
expected_timestamp = timestamp_list[5:]
self.assertEqual(expected_data, bucket.data['value'].tolist())
self.assertEqual(expected_timestamp, bucket.data['timestamp'].tolist())
def get_pd_dataframe(value: List[int], timestamp: List[int]) -> pd.DataFrame:
if len(value) != len(timestamp):
raise ValueError('len(value) should be equal to len(timestamp)')
return pd.DataFrame({ 'value': value, 'timestamp': timestamp })
if __name__ == '__main__':
unittest.main()

386
tests/test_dataset.py

@ -0,0 +1,386 @@
import unittest
import pandas as pd
import numpy as np
from utils import prepare_data
import models
import random
import scipy.signal
from typing import List
from analytic_types.segment import Segment
class TestDataset(unittest.TestCase):
def test_models_with_corrupted_dataframe(self):
data = [[1523889000000 + i, float('nan')] for i in range(10)]
dataframe = pd.DataFrame(data, columns=['timestamp', 'value'])
segments = []
model_instances = [
models.JumpModel(),
models.DropModel(),
models.GeneralModel(),
models.PeakModel(),
models.TroughModel()
]
for model in model_instances:
model_name = model.__class__.__name__
model.state = model.get_state(None)
with self.assertRaises(AssertionError):
model.fit(dataframe, segments, 'test')
def test_peak_antisegments(self):
data_val = [1.0, 1.0, 1.0, 2.0, 3.0, 2.0, 1.0, 1.0, 1.0, 1.0, 5.0, 7.0, 5.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
dataframe = create_dataframe(data_val)
segments = [{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000010, 'to': 1523889000012, 'labeled': True, 'deleted': False},
{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000003, 'to': 1523889000005, 'labeled': False, 'deleted': True}]
segments = [Segment.from_json(segment) for segment in segments]
try:
model = models.PeakModel()
model_name = model.__class__.__name__
model.state = model.get_state(None)
model.fit(dataframe, segments, 'test')
except ValueError:
self.fail('Model {} raised unexpectedly'.format(model_name))
def test_jump_antisegments(self):
data_val = [1.0, 1.0, 1.0, 1.0, 1.0, 5.0, 5.0, 5.0, 5.0, 1.0, 1.0, 1.0, 1.0, 9.0, 9.0, 9.0, 9.0, 9.0, 1.0, 1.0]
dataframe = create_dataframe(data_val)
segments = [{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000010, 'to': 1523889000016, 'labeled': True, 'deleted': False},
{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000002, 'to': 1523889000008, 'labeled': False, 'deleted': True}]
segments = [Segment.from_json(segment) for segment in segments]
try:
model = models.JumpModel()
model_name = model.__class__.__name__
model.state = model.get_state(None)
model.fit(dataframe, segments, 'test')
except ValueError:
self.fail('Model {} raised unexpectedly'.format(model_name))
def test_trough_antisegments(self):
data_val = [9.0, 9.0, 9.0, 9.0, 7.0, 4.0, 7.0, 9.0, 9.0, 9.0, 5.0, 1.0, 5.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0]
dataframe = create_dataframe(data_val)
segments = [{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000010, 'to': 1523889000012, 'labeled': True, 'deleted': False},
{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000003, 'to': 1523889000005, 'labeled': False, 'deleted': True}]
segments = [Segment.from_json(segment) for segment in segments]
try:
model = models.TroughModel()
model_name = model.__class__.__name__
model.state = model.get_state(None)
model.fit(dataframe, segments, 'test')
except ValueError:
self.fail('Model {} raised unexpectedly'.format(model_name))
def test_drop_antisegments(self):
data_val = [9.0, 9.0, 9.0, 9.0, 9.0, 5.0, 5.0, 5.0, 5.0, 9.0, 9.0, 9.0, 9.0, 1.0, 1.0, 1.0, 1.0, 1.0, 9.0, 9.0]
dataframe = create_dataframe(data_val)
segments = [{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000010, 'to': 1523889000016, 'labeled': True, 'deleted': False},
{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000002, 'to': 1523889000008, 'labeled': False, 'deleted': True}]
segments = [Segment.from_json(segment) for segment in segments]
try:
model = models.DropModel()
model_name = model.__class__.__name__
model.state = model.get_state(None)
model.fit(dataframe, segments, 'test')
except ValueError:
self.fail('Model {} raised unexpectedly'.format(model_name))
def test_general_antisegments(self):
data_val = [1.0, 2.0, 1.0, 2.0, 5.0, 6.0, 3.0, 2.0, 1.0, 1.0, 8.0, 9.0, 8.0, 1.0, 2.0, 3.0, 2.0, 1.0, 1.0, 2.0]
dataframe = create_dataframe(data_val)
segments = [{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000010, 'to': 1523889000012, 'labeled': True, 'deleted': False},
{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000003, 'to': 1523889000005, 'labeled': False, 'deleted': True}]
segments = [Segment.from_json(segment) for segment in segments]
try:
model = models.GeneralModel()
model_name = model.__class__.__name__
model.state = model.get_state(None)
model.fit(dataframe, segments, 'test')
except ValueError:
self.fail('Model {} raised unexpectedly'.format(model_name))
def test_jump_empty_segment(self):
data_val = [1.0, 1.0, 1.0, 1.0, 1.0, 5.0, 5.0, 5.0, 5.0, 1.0, 1.0, 1.0, 1.0, 9.0, 9.0, 9.0, 9.0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
dataframe = create_dataframe(data_val)
segments = [{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000019, 'to': 1523889000025, 'labeled': True, 'deleted': False},
{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000002, 'to': 1523889000008, 'labeled': True, 'deleted': False}]
segments = [Segment.from_json(segment) for segment in segments]
try:
model = models.JumpModel()
model_name = model.__class__.__name__
model.state = model.get_state(None)
model.fit(dataframe, segments, 'test')
except ValueError:
self.fail('Model {} raised unexpectedly'.format(model_name))
def test_drop_empty_segment(self):
data_val = [1.0, 1.0, 1.0, 1.0, 1.0, 5.0, 5.0, 5.0, 5.0, 1.0, 1.0, 1.0, 1.0, 9.0, 9.0, 9.0, 9.0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
dataframe = create_dataframe(data_val)
segments = [{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000019, 'to': 1523889000025, 'labeled': True, 'deleted': False},
{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000002, 'to': 1523889000008, 'labeled': True, 'deleted': False}]
segments = [Segment.from_json(segment) for segment in segments]
try:
model = models.DropModel()
model.state = model.get_state(None)
model_name = model.__class__.__name__
model.fit(dataframe, segments, 'test')
except ValueError:
self.fail('Model {} raised unexpectedly'.format(model_name))
def test_value_error_dataset_input_should_have_multiple_elements(self):
data_val = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 5.0, 5.0, 4.0, 5.0, 5.0, 6.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0,3.0,3.0,2.0,7.0,8.0,9.0,8.0,7.0,6.0]
dataframe = create_dataframe(data_val)
segments = [{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000007, 'to': 1523889000011, 'labeled': True, 'deleted': False}]
segments = [Segment.from_json(segment) for segment in segments]
try:
model = models.JumpModel()
model.state = model.get_state(None)
model_name = model.__class__.__name__
model.fit(dataframe, segments, 'test')
except ValueError:
self.fail('Model {} raised unexpectedly'.format(model_name))
def test_prepare_data_for_nonetype(self):
data = [[1523889000000, None], [1523889000001, None], [1523889000002, None]]
try:
data = prepare_data(data)
except ValueError:
self.fail('prepare_data raised unexpectedly')
def test_prepare_data_for_nan(self):
data = [[1523889000000, np.nan], [1523889000001, np.nan], [1523889000002, np.nan]]
try:
data = prepare_data(data)
except ValueError:
self.fail('prepare_data raised unexpectedly')
def test_prepare_data_output_for_nan(self):
data_nan = [[1523889000000, np.nan], [1523889000001, np.nan], [1523889000002, np.nan]]
data_none = [[1523889000000, None], [1523889000001, None], [1523889000002, None]]
return_data_nan = prepare_data(data_nan)
return_data_none = prepare_data(data_none)
for item in return_data_nan.value:
self.assertTrue(np.isnan(item))
for item in return_data_none.value:
self.assertTrue(np.isnan(item))
def test_three_value_segment(self):
data_val = [1.0, 1.0, 1.0, 1.0, 1.0, 5.0, 2.0, 5.0, 5.0, 1.0, 1.0, 1.0, 1.0, 9.0, 9.0, 9.0, 9.0, 2.0, 3.0, 4.0, 5.0, 4.0, 2.0, 1.0, 3.0, 4.0]
dataframe = create_dataframe(data_val)
segments = [{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000004, 'to': 1523889000006, 'labeled': True, 'deleted': False}]
segments = [Segment.from_json(segment) for segment in segments]
model_instances = [
models.GeneralModel(),
models.PeakModel(),
]
try:
for model in model_instances:
model_name = model.__class__.__name__
model.state = model.get_state(None)
model.fit(dataframe, segments, 'test')
except ValueError:
self.fail('Model {} raised unexpectedly'.format(model_name))
def test_general_for_two_labeling(self):
data_val = [1.0, 2.0, 5.0, 2.0, 1.0, 1.0, 3.0, 6.0, 4.0, 2.0, 1.0, 0, 0]
dataframe = create_dataframe(data_val)
segments = [{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000001, 'to': 1523889000003, 'labeled': True, 'deleted': False}]
segments = [Segment.from_json(segment) for segment in segments]
model = models.GeneralModel()
model.state = model.get_state(None)
model.fit(dataframe, segments,'test')
result = len(data_val) + 1
for _ in range(2):
model.do_detect(dataframe)
max_pattern_index = max(model.do_detect(dataframe))
self.assertLessEqual(max_pattern_index[0], result)
def test_peak_model_for_cache(self):
cache = {
'patternCenter': [1, 6],
'patternModel': [1, 4, 0],
'confidence': 2,
'convolveMax': 8,
'convolveMin': 7,
'windowSize': 1,
'convDelMin': 0,
'convDelMax': 0,
'heightMax': 4,
'heightMin': 4,
}
data_val = [2.0, 5.0, 1.0, 1.0, 1.0, 2.0, 5.0, 1.0, 1.0, 2.0, 3.0, 7.0, 1.0, 1.0, 1.0]
dataframe = create_dataframe(data_val)
segments = [{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000010, 'to': 1523889000012, 'labeled': True, 'deleted': False}]
segments = [Segment.from_json(segment) for segment in segments]
model = models.PeakModel()
model.state = model.get_state(cache)
result = model.fit(dataframe, segments, 'test')
self.assertEqual(len(result.pattern_center), 3)
def test_trough_model_for_cache(self):
cache = {
'patternCenter': [2, 6],
'patternModel': [5, 0.5, 4],
'confidence': 2,
'convolveMax': 8,
'convolveMin': 7,
'window_size': 1,
'convDelMin': 0,
'convDelMax': 0,
}
data_val = [5.0, 5.0, 1.0, 4.0, 5.0, 5.0, 0.0, 4.0, 5.0, 5.0, 6.0, 1.0, 5.0, 5.0, 5.0]
dataframe = create_dataframe(data_val)
segments = [{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000010, 'to': 1523889000012, 'labeled': True, 'deleted': False}]
segments = [Segment.from_json(segment) for segment in segments]
model = models.TroughModel()
model.state = model.get_state(cache)
result = model.fit(dataframe, segments, 'test')
self.assertEqual(len(result.pattern_center), 3)
def test_jump_model_for_cache(self):
cache = {
'patternCenter': [2, 6],
'patternModel': [5, 0.5, 4],
'confidence': 2,
'convolveMax': 8,
'convolveMin': 7,
'window_size': 1,
'convDelMin': 0,
'convDelMax': 0,
}
data_val = [1.0, 1.0, 1.0, 4.0, 4.0, 0.0, 0.0, 5.0, 5.0, 0.0, 0.0, 4.0, 4.0, 4.0, 4.0]
dataframe = create_dataframe(data_val)
segments = [{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 152388900009, 'to': 1523889000013, 'labeled': True, 'deleted': False}]
segments = [Segment.from_json(segment) for segment in segments]
model = models.JumpModel()
model.state = model.get_state(cache)
result = model.fit(dataframe, segments, 'test')
self.assertEqual(len(result.pattern_center), 3)
def test_models_for_pattern_model_cache(self):
cache = {
'patternCenter': [4, 12],
'patternModel': [],
'confidence': 2,
'convolveMax': 8,
'convolveMin': 7,
'window_size': 2,
'convDelMin': 0,
'convDelMax': 0,
}
data_val = [5.0, 5.0, 5.0, 5.0, 1.0, 1.0, 1.0, 1.0, 9.0, 9.0, 9.0, 9.0, 0, 0, 0, 0, 0, 0, 6.0, 6.0, 6.0, 1.0, 1.0, 1.0, 1.0, 1.0]
dataframe = create_dataframe(data_val)
segments = [{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000019, 'to': 1523889000024, 'labeled': True, 'deleted': False}]
segments = [Segment.from_json(segment) for segment in segments]
try:
model = models.DropModel()
model_name = model.__class__.__name__
model.state = model.get_state(cache)
model.fit(dataframe, segments, 'test')
except ValueError:
self.fail('Model {} raised unexpectedly'.format(model_name))
def test_problem_data_for_random_model(self):
problem_data = [2.0, 3.0, 3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0,
3.0, 3.0, 3.0, 5.0, 5.0, 5.0, 5.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0,
3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 2.0, 6.0, 7.0, 8.0, 8.0, 4.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0,
4.0, 4.0, 4.0, 3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0,
4.0, 4.0, 4.0, 4.0, 4.0, 6.0, 5.0, 4.0, 4.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 2.0, 3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
2.0, 8.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]
data = create_dataframe(problem_data)
cache = {
'patternCenter': [5, 50],
'patternModel': [],
'windowSize': 2,
'convolveMin': 0,
'convolveMax': 0,
'convDelMin': 0,
'convDelMax': 0,
}
max_ws = 20
iteration = 1
for ws in range(1, max_ws):
for _ in range(iteration):
pattern_model = create_random_model(ws)
convolve = scipy.signal.fftconvolve(pattern_model, pattern_model)
cache['windowSize'] = ws
cache['patternModel'] = pattern_model
cache['convolveMin'] = max(convolve)
cache['convolveMax'] = max(convolve)
try:
model = models.GeneralModel()
model.state = model.get_state(cache)
model_name = model.__class__.__name__
model.detect(data, 'test')
except ValueError:
self.fail('Model {} raised unexpectedly with av_model {} and window size {}'.format(model_name, pattern_model, ws))
def test_random_dataset_for_random_model(self):
data = create_random_model(random.randint(1, 100))
data = create_dataframe(data)
model_instances = [
models.PeakModel(),
models.TroughModel()
]
cache = {
'patternCenter': [5, 50],
'patternModel': [],
'windowSize': 2,
'convolveMin': 0,
'convolveMax': 0,
'confidence': 0,
'heightMax': 0,
'heightMin': 0,
'convDelMin': 0,
'convDelMax': 0,
}
ws = random.randint(1, len(data['value']) // 2)
pattern_model = create_random_model(ws)
convolve = scipy.signal.fftconvolve(pattern_model, pattern_model)
confidence = 0.2 * (data['value'].max() - data['value'].min())
cache['windowSize'] = ws
cache['patternModel'] = pattern_model
cache['convolveMin'] = max(convolve)
cache['convolveMax'] = max(convolve)
cache['confidence'] = confidence
cache['heightMax'] = data['value'].max()
cache['heightMin'] = confidence
try:
for model in model_instances:
model_name = model.__class__.__name__
model.state = model.get_state(cache)
model.detect(data, 'test')
except ValueError:
self.fail('Model {} raised unexpectedly with dataset {} and cache {}'.format(model_name, data['value'], cache))
if __name__ == '__main__':
unittest.main()
def create_dataframe(data_val: list) -> pd.DataFrame:
data_ind = create_list_of_timestamps(len(data_val))
data = {'timestamp': data_ind, 'value': data_val}
dataframe = pd.DataFrame(data)
dataframe['timestamp'] = pd.to_datetime(dataframe['timestamp'], unit='ms')
return dataframe
def create_list_of_timestamps(length: int) -> List[int]:
return [1523889000000 + i for i in range(length)]
def create_random_model(window_size: int) -> list:
return [random.randint(0, 100) for _ in range(window_size * 2 + 1)]
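# Note: create_random_model(ws) returns 2 * ws + 1 points, which appears to mirror the convention
# used in the caches above, where a pattern of window size ws spans the center point plus ws points
# on each side (e.g. windowSize 1 with a 3-point patternModel).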

265
tests/test_detectors.py

@ -0,0 +1,265 @@
import unittest
import pandas as pd
from detectors import pattern_detector, threshold_detector, anomaly_detector
from analytic_types.detector import DetectionResult, ProcessingResult, Bound
from analytic_types.segment import Segment
from tests.test_dataset import create_dataframe, create_list_of_timestamps
from utils import convert_pd_timestamp_to_ms
class TestPatternDetector(unittest.TestCase):
def test_small_dataframe(self):
data = [[0,1], [1,2]]
dataframe = pd.DataFrame(data, columns=['timestamp', 'values'])
cache = { 'windowSize': 10 }
detector = pattern_detector.PatternDetector('GENERAL', 'test_id')
with self.assertRaises(ValueError):
detector.detect(dataframe, cache)
def test_only_negative_segments(self):
data_val = [0, 1, 2, 1, 2, 10, 1, 2, 1]
data_ind = [1523889000000 + i for i in range(len(data_val))]
data = {'timestamp': data_ind, 'value': data_val}
dataframe = pd.DataFrame(data = data)
segments = [{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000019, 'to': 1523889000025, 'labeled': False, 'deleted': False},
{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000002, 'to': 1523889000008, 'labeled': False, 'deleted': False}]
segments = [Segment.from_json(segment) for segment in segments]
cache = {}
detector = pattern_detector.PatternDetector('PEAK', 'test_id')
expected_error_message = 'test_id has no positive labeled segments. Pattern detector needs at least 1 positive labeled segment'
with self.assertRaises(ValueError) as context:
detector.train(dataframe, segments, cache)
self.assertEqual(str(context.exception), expected_error_message)
def test_positive_and_negative_segments(self):
data_val = [1.0, 1.0, 1.0, 2.0, 3.0, 2.0, 1.0, 1.0, 1.0, 1.0, 5.0, 7.0, 5.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
dataframe = create_dataframe(data_val)
segments = [{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000004, 'to': 1523889000006, 'labeled': True, 'deleted': False},
{'_id': 'Esl7uetLhx4lCqHa', 'analyticUnitId': 'opnICRJwOmwBELK8', 'from': 1523889000001, 'to': 1523889000003, 'labeled': False, 'deleted': False}]
segments = [Segment.from_json(segment) for segment in segments]
cache = {}
detector = pattern_detector.PatternDetector('PEAK', 'test_id')
try:
detector.train(dataframe, segments, cache)
except Exception as e:
self.fail('detector.train fail with error {}'.format(e))
class TestThresholdDetector(unittest.TestCase):
def test_invalid_cache(self):
detector = threshold_detector.ThresholdDetector('test_id')
with self.assertRaises(ValueError):
detector.detect([], None)
with self.assertRaises(ValueError):
detector.detect([], {})
class TestAnomalyDetector(unittest.TestCase):
def test_detect(self):
data_val = [0, 1, 2, 1, 2, 10, 1, 2, 1]
data_ind = [1523889000000 + i for i in range(len(data_val))]
data = {'timestamp': data_ind, 'value': data_val}
dataframe = pd.DataFrame(data = data)
dataframe['timestamp'] = pd.to_datetime(dataframe['timestamp'], unit='ms')
cache = {
'confidence': 2,
'alpha': 0.1,
'enableBounds': 'ALL',
'timeStep': 1
}
detector = anomaly_detector.AnomalyDetector('test_id')
detect_result: DetectionResult = detector.detect(dataframe, cache)
detected_segments = list(map(lambda s: {'from': s.from_timestamp, 'to': s.to_timestamp}, detect_result.segments))
result = [{ 'from': 1523889000005.0, 'to': 1523889000005.0 }]
self.assertEqual(result, detected_segments)
cache = {
'confidence': 2,
'alpha': 0.1,
'enableBounds': 'ALL',
'timeStep': 1,
'seasonality': 4,
'segments': [{ 'from': 1523889000001, 'to': 1523889000002, 'data': [10] }]
}
detect_result: DetectionResult = detector.detect(dataframe, cache)
detected_segments = list(map(lambda s: {'from': s.from_timestamp, 'to': s.to_timestamp}, detect_result.segments))
result = []
self.assertEqual(result, detected_segments)
def test_process_data(self):
data_val = [0, 1, 2, 1, 2, 10, 1, 2, 1]
data_ind = [1523889000000 + i for i in range(len(data_val))]
data = {'timestamp': data_ind, 'value': data_val}
dataframe = pd.DataFrame(data = data)
dataframe['timestamp'] = pd.to_datetime(dataframe['timestamp'], unit='ms')
cache = {
'confidence': 2,
'alpha': 0.1,
'enableBounds': 'ALL',
'timeStep': 1
}
detector = anomaly_detector.AnomalyDetector('test_id')
detect_result: ProcessingResult = detector.process_data(dataframe, cache)
expected_result = {
'lowerBound': [
(1523889000000, -2.0),
(1523889000001, -1.9),
(1523889000002, -1.71),
(1523889000003, -1.6389999999999998),
(1523889000004, -1.4750999999999999),
(1523889000005, -0.5275899999999998),
(1523889000006, -0.5748309999999996),
(1523889000007, -0.5173478999999996),
(1523889000008, -0.5656131099999995)
],
'upperBound': [
(1523889000000, 2.0),
(1523889000001, 2.1),
(1523889000002, 2.29),
(1523889000003, 2.361),
(1523889000004, 2.5249),
(1523889000005, 3.47241),
(1523889000006, 3.4251690000000004),
(1523889000007, 3.4826521),
(1523889000008, 3.4343868900000007)
]}
self.assertEqual(detect_result.to_json(), expected_result)
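# Sanity check on the expected numbers above (a reading of the fixture, not of the detector
# internals): with alpha = 0.1 the baseline looks like simple exponential smoothing,
# smoothed[0] = 0 and smoothed[i] = 0.1 * value[i] + 0.9 * smoothed[i - 1], with bounds at
# smoothed ± confidence (2). E.g. smoothed[5] = 0.1 * 10 + 0.9 * 0.5249 = 1.47241, giving
# upperBound 3.47241 and lowerBound -0.52759 at timestamp 1523889000005.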
cache = {
'confidence': 2,
'alpha': 0.1,
'enableBounds': 'ALL',
'timeStep': 1,
'seasonality': 5,
'segments': [{ 'from': 1523889000001, 'to': 1523889000002,'data': [1] }]
}
detect_result: ProcessingResult = detector.process_data(dataframe, cache)
expected_result = {
'lowerBound': [
(1523889000000, -2.0),
(1523889000001, -2.9),
(1523889000002, -1.71),
(1523889000003, -1.6389999999999998),
(1523889000004, -1.4750999999999999),
(1523889000005, -0.5275899999999998),
(1523889000006, -1.5748309999999996),
(1523889000007, -0.5173478999999996),
(1523889000008, -0.5656131099999995)
],
'upperBound': [
(1523889000000, 2.0),
(1523889000001, 3.1),
(1523889000002, 2.29),
(1523889000003, 2.361),
(1523889000004, 2.5249),
(1523889000005, 3.47241),
(1523889000006, 4.425169),
(1523889000007, 3.4826521),
(1523889000008, 3.4343868900000007)
]}
self.assertEqual(detect_result.to_json(), expected_result)
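# Compared to the first case, the only change is the seasonal segment (amplitude 1, seasonality 5):
# the bounds widen by 1 at index 1 (the segment's own timestamp) and again 5 points later at index 6,
# e.g. upperBound goes from 2.1 to 3.1 and lowerBound from -1.9 to -2.9 at 1523889000001.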
def test_get_seasonality_offset(self):
detector = anomaly_detector.AnomalyDetector('test_id')
from_timestamp = 1573700973027
seasonality = 3600000
data_start_time = 1573698780000
time_step = 30000
detected_offset = detector.get_seasonality_offset(from_timestamp, seasonality, data_start_time, time_step)
expected_offset = 74
self.assertEqual(detected_offset, expected_offset)
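# Worked arithmetic behind the expected offset (an assumption about the formula, not taken from
# the implementation): (from_timestamp - data_start_time) % seasonality = 2193027 ms, which is
# 2193027 / 30000 = 73.1 time steps; rounding up gives 74.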
def test_segment_generator(self):
detector = anomaly_detector.AnomalyDetector('test_id')
data = [1, 1, 5, 1, -4, 5, 5, 5, -3, 1]
timestamps = create_list_of_timestamps(len(data))
dataframe = create_dataframe(data)
upper_bound = pd.Series([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
lower_bound = pd.Series([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
segments = list(detector.detections_generator(dataframe, upper_bound, lower_bound, enabled_bounds=Bound.ALL))
segments_borders = list(map(lambda s: [s.from_timestamp, s.to_timestamp], segments))
self.assertEqual(segments_borders, [[timestamps[2], timestamps[2]], [timestamps[4], timestamps[8]]])
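# With upper_bound = 2 and lower_bound = 0, the values outside the band are 5 at index 2 and the
# run [-4, 5, 5, 5, -3] at indexes 4..8, which is why two segments are expected: [2, 2] and [4, 8].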
def test_consume_data(self):
cache = {
'confidence': 2,
'alpha': 0.1,
'enableBounds': 'ALL',
'timeStep': 1
}
detector = anomaly_detector.AnomalyDetector('test_id')
detect_result: DetectionResult = None
for val in range(22):
value = 1 if val != 10 else 5
dataframe = pd.DataFrame({'value': [value], 'timestamp': [1523889000000 + val]})
dataframe['timestamp'] = pd.to_datetime(dataframe['timestamp'], unit='ms')
detect_result = detector.consume_data(dataframe, cache)
detected_segments = list(map(lambda s: {'from': s.from_timestamp, 'to': s.to_timestamp}, detect_result.segments))
result = [{ 'from': 1523889000010, 'to': 1523889000010 }]
self.assertEqual(result, detected_segments)
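# The stream is 22 points of value 1 with a single spike of 5 at offset 10. With confidence 2 and
# a smoothed baseline near 1, only the spike falls outside the band, so exactly one segment at
# 1523889000010 is expected.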
def test_get_segment_bound(self):
detector = anomaly_detector.AnomalyDetector('test_id')
peak_segment = pd.Series([1,2,3,4,3,2,1])
trough_segment = pd.Series([4,3,2,1,2,3,4])
expected_peak_segment_results = {
'max_value': 3,
'min_value': 1.5
}
expected_trough_segment_results = {
'max_value': 3.5,
'min_value': 2.75
}
peak_detector_result_upper = detector.get_segment_bound(peak_segment, Bound.UPPER)
peak_detector_result_lower = detector.get_segment_bound(peak_segment, Bound.LOWER)
trough_detector_result_upper = detector.get_segment_bound(trough_segment, Bound.UPPER)
trough_detector_result_lower = detector.get_segment_bound(trough_segment, Bound.LOWER)
self.assertGreaterEqual(
max(peak_detector_result_upper),
expected_peak_segment_results['max_value']
)
self.assertLessEqual(
max(peak_detector_result_lower),
expected_peak_segment_results['min_value']
)
self.assertGreaterEqual(
max(trough_detector_result_upper),
expected_trough_segment_results['max_value']
)
self.assertLessEqual(
max(trough_detector_result_lower),
expected_trough_segment_results['min_value']
)
def test_get_segment_bound_corner_cases(self):
detector = anomaly_detector.AnomalyDetector('test_id')
empty_segment = pd.Series([])
same_values_segment = pd.Series([2,2,2,2,2,2])
empty_detector_result_upper = detector.get_segment_bound(empty_segment, Bound.UPPER)
empty_detector_result_lower = detector.get_segment_bound(empty_segment, Bound.LOWER)
same_values_detector_result_upper = detector.get_segment_bound(same_values_segment, Bound.UPPER)
same_values_detector_result_lower = detector.get_segment_bound(same_values_segment, Bound.LOWER)
self.assertEqual(len(empty_detector_result_upper), 0)
self.assertEqual(len(empty_detector_result_lower), 0)
self.assertEqual(min(same_values_detector_result_upper), 0)
self.assertEqual(max(same_values_detector_result_upper), 0)
self.assertEqual(min(same_values_detector_result_lower), 0)
self.assertEqual(max(same_values_detector_result_lower), 0)
if __name__ == '__main__':
unittest.main()

100
tests/test_manager.py

@ -0,0 +1,100 @@
from models import PeakModel, DropModel, TroughModel, JumpModel, GeneralModel
from models import GeneralModelState
import utils.meta
import aiounittest
from analytic_unit_manager import AnalyticUnitManager
from collections import namedtuple
TestData = namedtuple('TestData', ['uid', 'type', 'values', 'segments'])
def get_random_id() -> str:
return str(id(list()))
class TestDataset(aiounittest.AsyncTestCase):
timestep = 50 #ms
def _fill_task(self, uid, data, task_type, analytic_unit_type, segments=None, cache=None):
task = {
'analyticUnitId': uid,
'type': task_type,
'payload': {
'data': data,
'from': data[0][0],
'to': data[-1][0],
'analyticUnitType': analytic_unit_type,
'detector': 'pattern',
'cache': cache
},
'_id': get_random_id()
}
if segments: task['payload']['segments'] = segments
return task
def _convert_values(self, values) -> list:
from_t = 0
to_t = len(values) * self.timestep
return list(zip(range(from_t, to_t, self.timestep), values))
def _index_to_test_time(self, idx) -> int:
return idx * self.timestep
def _get_learn_task(self, test_data):
uid, analytic_unit_type, values, segments = test_data
data = self._convert_values(values)
segments = [{
'analyticUnitId': uid,
'from': self._index_to_test_time(s[0]),
'to': self._index_to_test_time(s[1]),
'labeled': True,
'deleted': False
} for s in segments]
return self._fill_task(uid, data, 'LEARN', analytic_unit_type, segments=segments)
def _get_detect_task(self, test_data, cache):
uid, analytic_unit_type, values, _ = test_data
data = self._convert_values(values)
return self._fill_task(uid, data, 'DETECT', analytic_unit_type, cache=cache)
def _get_test_dataset(self, pattern) -> tuple:
"""
Maps a pattern name to ([dataset values], [list of segments]);
each segment is a (begin, end) pair of indexes into the dataset values.
Returns the dataset in the format (data: List[int], segments: List[List[int]]).
"""
datasets = {
'PEAK': ([0, 0, 1, 2, 3, 4, 3, 2, 1, 0, 0], [[2, 8]]),
'JUMP': ([0, 0, 1, 2, 3, 4, 4, 4], [[1, 6]]),
'DROP': ([4, 4, 4, 3, 2, 1, 0, 0], [[1, 6]]),
'TROUGH': ([4, 4, 3, 2, 1, 0, 1, 2, 3, 4, 4], [[1, 9]]),
'GENERAL': ([0, 0, 1, 2, 3, 4, 3, 2, 1, 0, 0], [[2, 8]])
}
return datasets[pattern]
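# Illustrative usage (values and segments come straight from the table above):
# values, segments = self._get_test_dataset('PEAK')
# # values == [0, 0, 1, 2, 3, 4, 3, 2, 1, 0, 0], segments == [[2, 8]]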
async def _learn(self, task, manager=None) -> dict:
if not manager: manager = AnalyticUnitManager()
result = await manager.handle_analytic_task(task)
return result['payload']['cache']
async def _detect(self, task, manager=None) -> dict:
if not manager: manager = AnalyticUnitManager()
result = await manager.handle_analytic_task(task)
return result
async def _test_detect(self, test_data, manager=None):
learn_task = self._get_learn_task(test_data)
cache = await self._learn(learn_task, manager)
detect_task = self._get_detect_task(test_data, cache)
result = await self._detect(detect_task, manager)
return result
async def test_unit_manager(self):
test_data = TestData(get_random_id(), 'PEAK', [0,1,2,5,10,5,2,1,1,1,0,0,0,0], [[1,7]])
manager = AnalyticUnitManager()
with_manager = await self._test_detect(test_data, manager)
without_manager = await self._test_detect(test_data)
self.assertEqual(with_manager, without_manager)

43
tests/test_models.py

@ -0,0 +1,43 @@
import unittest
import pandas as pd
import numpy as np
import models
class TestModel(unittest.TestCase):
def test_stair_model_get_indexes(self):
drop_model = models.DropModel()
jump_model = models.JumpModel()
drop_data = pd.Series([4, 4, 4, 1, 1, 1, 5, 5, 2, 2, 2])
jump_data = pd.Series([1, 1, 1, 4, 4, 4, 2, 2, 5, 5, 5])
jump_data_one_stair = pd.Series([1, 3, 3])
drop_data_one_stair = pd.Series([4, 2, 1])
height = 2
length = 2
expected_result = [2, 7]
drop_model_result = drop_model.get_stair_indexes(drop_data, height, length)
jump_model_result = jump_model.get_stair_indexes(jump_data, height, length)
drop_one_stair_result = drop_model.get_stair_indexes(drop_data_one_stair, height, 1)
jump_one_stair_result = jump_model.get_stair_indexes(jump_data_one_stair, height, 1)
for val in expected_result:
self.assertIn(val, drop_model_result)
self.assertIn(val, jump_model_result)
self.assertEqual(0, drop_one_stair_result[0])
self.assertEqual(0, jump_one_stair_result[0])
def test_stair_model_get_indexes_corner_cases(self):
drop_model = models.DropModel()
jump_model = models.JumpModel()
empty_data = pd.Series([])
nan_data = pd.Series([np.nan, np.nan, np.nan, np.nan])
height, length = 2, 2
length_zero, height_zero = 0, 0
expected_result = []
drop_empty_data_result = drop_model.get_stair_indexes(empty_data, height, length)
drop_nan_data_result = drop_model.get_stair_indexes(nan_data, height_zero, length_zero)
jump_empty_data_result = jump_model.get_stair_indexes(empty_data, height, length)
jump_nan_data_result = jump_model.get_stair_indexes(nan_data, height_zero, length_zero)
self.assertEqual(drop_empty_data_result, expected_result)
self.assertEqual(drop_nan_data_result, expected_result)
self.assertEqual(jump_empty_data_result, expected_result)
self.assertEqual(jump_nan_data_result, expected_result)

359
tests/test_utils.py

@ -0,0 +1,359 @@
from analytic_types.segment import Segment
import utils
import unittest
import numpy as np
import pandas as pd
import math
import random
RELATIVE_TOLERANCE = 1e-1
class TestUtils(unittest.TestCase):
# minimal sanity test to verify the test workflow itself
def test_segment_parsing(self):
self.assertTrue(True)
def test_confidence_all_normal_value(self):
segment = [1, 2, 0, 6, 8, 5, 3]
utils_result = utils.find_confidence(segment)[0]
result = 4.0
self.assertTrue(math.isclose(utils_result, result, rel_tol = RELATIVE_TOLERANCE))
def test_confidence_all_nan_value(self):
segment = [np.nan, np.nan, np.nan, np.nan]
self.assertEqual(utils.find_confidence(segment)[0], 0)
def test_confidence_with_nan_value(self):
data = [np.nan, np.nan, 0, 8]
utils_result = utils.find_confidence(data)[0]
result = 4.0
self.assertTrue(math.isclose(utils_result, result, rel_tol = RELATIVE_TOLERANCE))
def test_interval_all_normal_value(self):
data = [1, 2, 1, 2, 4, 1, 2, 4, 5, 6]
data = pd.Series(data)
center = 4
window_size = 2
result = [1, 2, 4, 1, 2]
self.assertEqual(list(utils.get_interval(data, center, window_size)), result)
def test_interval_wrong_ws(self):
data = [1, 2, 4, 1, 2, 4]
data = pd.Series(data)
center = 3
window_size = 6
result = [1, 2, 4, 1, 2, 4]
self.assertEqual(list(utils.get_interval(data, center, window_size)), result)
def test_subtract_min_without_nan(self):
segment = [1, 2, 4, 1, 2, 4]
segment = pd.Series(segment)
result = [0, 1, 3, 0, 1, 3]
utils_result = list(utils.subtract_min_without_nan(segment))
self.assertEqual(utils_result, result)
def test_subtract_min_with_nan(self):
segment = [np.nan, 2, 4, 1, 2, 4]
segment = pd.Series(segment)
result = [2, 4, 1, 2, 4]
utils_result = list(utils.subtract_min_without_nan(segment)[1:])
self.assertEqual(utils_result, result)
def test_get_convolve(self):
data = [1, 2, 3, 2, 2, 0, 2, 3, 4, 3, 2, 1, 1, 2, 3, 4, 3, 2, 0]
data = pd.Series(data)
pattern_index = [2, 8, 15]
window_size = 2
av_model = [1, 2, 3, 2, 1]
result = []
self.assertNotEqual(utils.get_convolve(pattern_index, av_model, data, window_size), result)
def test_get_convolve_with_nan(self):
data = [1, 2, 3, 2, np.nan, 0, 2, 3, 4, np.nan, 2, 1, 1, 2, 3, 4, 3, np.nan, 0]
data = pd.Series(data)
pattern_index = [2, 8, 15]
window_size = 2
av_model = [1, 2, 3, 2, 1]
result = utils.get_convolve(pattern_index, av_model, data, window_size)
for val in result:
self.assertFalse(np.isnan(val))
def test_get_convolve_empty_data(self):
data = []
pattern_index = []
window_size = 2
window_size_zero = 0
av_model = []
result = []
self.assertEqual(utils.get_convolve(pattern_index, av_model, data, window_size), result)
self.assertEqual(utils.get_convolve(pattern_index, av_model, data, window_size_zero), result)
def test_find_jump_parameters_center(self):
segment = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
segment = pd.Series(segment)
jump_center = [10, 11]
self.assertIn(utils.find_pattern_center(segment, 0, 'jump'), jump_center)
def test_find_jump_parameters_height(self):
segment = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
segment = pd.Series(segment)
jump_height = [3.5, 4]
self.assertGreaterEqual(utils.find_parameters(segment, 0, 'jump')[0], jump_height[0])
self.assertLessEqual(utils.find_parameters(segment, 0, 'jump')[0], jump_height[1])
def test_find_jump_parameters_length(self):
segment = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
segment = pd.Series(segment)
jump_length = 2
self.assertEqual(utils.find_parameters(segment, 0, 'jump')[1], jump_length)
def test_find_drop_parameters_center(self):
segment = [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
segment = pd.Series(segment)
drop_center = [14, 15, 16]
self.assertIn(utils.find_pattern_center(segment, 0, 'drop'), drop_center)
def test_find_drop_parameters_height(self):
segment = [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
segment = pd.Series(segment)
drop_height = [3.5, 4]
self.assertGreaterEqual(utils.find_parameters(segment, 0, 'drop')[0], drop_height[0])
self.assertLessEqual(utils.find_parameters(segment, 0, 'drop')[0], drop_height[1])
def test_find_drop_parameters_length(self):
segment = [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
segment = pd.Series(segment)
drop_length = 2
self.assertEqual(utils.find_parameters(segment, 0, 'drop')[1], drop_length)
def test_get_av_model_empty_data(self):
patterns_list = []
result = []
self.assertEqual(utils.get_av_model(patterns_list), result)
def test_get_av_model_normal_data(self):
patterns_list = [[1, 1, 1], [2, 2, 2],[3,3,3]]
result = [2.0, 2.0, 2.0]
self.assertEqual(utils.get_av_model(patterns_list), result)
def test_get_distribution_density(self):
segment = [1, 1, 1, 3, 5, 5, 5]
segment = pd.Series(segment)
result = (3, 5, 1)
self.assertEqual(utils.get_distribution_density(segment), result)
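# A reading of the expected tuples in these tests (not a statement about the implementation):
# get_distribution_density appears to return (midpoint between the dominant low and high levels,
# the high level, the low level), e.g. (3, 5, 1) for a segment that sits mostly at 1 and 5.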
def test_get_distribution_density_right(self):
data = [1.0, 5.0, 5.0, 4.0]
data = pd.Series(data)
median = 3.0
max_line = 5.0
min_line = 1.0
utils_result = utils.get_distribution_density(data)
self.assertTrue(math.isclose(utils_result[0], median, rel_tol = RELATIVE_TOLERANCE))
self.assertTrue(math.isclose(utils_result[1], max_line, rel_tol = RELATIVE_TOLERANCE))
self.assertTrue(math.isclose(utils_result[2], min_line, rel_tol = RELATIVE_TOLERANCE))
def test_get_distribution_density_left(self):
data = [1.0, 1.0, 2.0, 1.0, 5.0]
data = pd.Series(data)
median = 3.0
max_line = 5.0
min_line = 1.0
utils_result = utils.get_distribution_density(data)
self.assertTrue(math.isclose(utils_result[0], median, rel_tol = RELATIVE_TOLERANCE))
self.assertTrue(math.isclose(utils_result[1], max_line, rel_tol = RELATIVE_TOLERANCE))
self.assertTrue(math.isclose(utils_result[2], min_line, rel_tol = RELATIVE_TOLERANCE))
def test_get_distribution_density_short_data(self):
data = [1.0, 5.0]
data = pd.Series(data)
segment = [1.0]
segment = pd.Series(segment)
utils_result_data = utils.get_distribution_density(data)
utils_result_segment = utils.get_distribution_density(segment)
self.assertEqual(len(utils_result_data), 3)
self.assertEqual(utils_result_segment, (0, 0, 0))
def test_get_distribution_density_with_nans(self):
segment = [np.NaN, 1, 1, 1, np.NaN, 3, 5, 5, 5, np.NaN]
segment = pd.Series(segment)
result = (3, 5, 1)
self.assertEqual(utils.get_distribution_density(segment), result)
def test_find_pattern_jump_center(self):
data = [1.0, 1.0, 1.0, 5.0, 5.0, 5.0]
data = pd.Series(data)
median = 3.0
result = 3
self.assertEqual(result, utils.find_pattern_center(data, 0, 'jump'))
def test_get_convolve_wrong_index(self):
data = [1.0, 5.0, 2.0, 1.0, 6.0, 2.0]
data = pd.Series(data)
segments = [1, 11]
av_model = [0.0, 4.0, 0.0]
window_size = 1
try:
utils.get_convolve(segments, av_model, data, window_size)
except ValueError:
self.fail('Method get_convolve raised unexpectedly')
def test_get_av_model_for_different_length(self):
patterns_list = [[1.0, 1.0, 2.0], [4.0, 4.0], [2.0, 2.0, 2.0], [3.0, 3.0], []]
try:
utils.get_av_model(patterns_list)
except ValueError:
self.fail('Method get_av_model raised unexpectedly')
def test_find_nan_indexes(self):
data = [1, 1, 1, 0, 0, np.nan, None, []]
data = pd.Series(data)
result = [5, 6]
self.assertEqual(utils.find_nan_indexes(data), result)
def test_find_nan_indexes_normal_values(self):
data = [1, 1, 1, 0, 0, 0, 1, 1]
data = pd.Series(data)
result = []
self.assertEqual(utils.find_nan_indexes(data), result)
def test_find_nan_indexes_empty_values(self):
data = []
result = []
self.assertEqual(utils.find_nan_indexes(data), result)
def test_create_correlation_data(self):
data = [random.randint(10, 999) for _ in range(10000)]
data = pd.Series(data)
pattern_model = [100, 200, 500, 300, 100]
ws = 2
result = 6000
corr_data = utils.get_correlation_gen(data, ws, pattern_model)
corr_data = list(corr_data)
self.assertGreaterEqual(len(corr_data), result)
def test_inverse_segment(self):
data = pd.Series([1,2,3,4,3,2,1])
result = pd.Series([3,2,1,0,1,2,3])
utils_result = utils.inverse_segment(data)
for ind, val in enumerate(utils_result):
self.assertEqual(val, result[ind])
def test_get_end_of_segment_equal(self):
data = pd.Series([5,4,3,2,1,0,0,0])
result_list = [4, 5, 6]
self.assertIn(utils.get_end_of_segment(data, False), result_list)
def test_get_end_of_segment_greater(self):
data = pd.Series([5,4,3,2,1,0,1,2,3])
result_list = [4, 5, 6]
self.assertIn(utils.get_end_of_segment(data, False), result_list)
def test_get_borders_of_peaks(self):
data = pd.Series([1,0,1,2,3,2,1,0,0,1,2,3,4,3,2,2,1,0,1,2,3,4,5,3,2,1,0])
pattern_center = [4, 12, 22]
ws = 3
confidence = 1.5
result = [(1, 7), (9, 15), (19, 25)]
self.assertEqual(utils.get_borders_of_peaks(pattern_center, data, ws, confidence), result)
def test_get_borders_of_peaks_for_trough(self):
data = pd.Series([4,4,5,5,3,1,3,5,5,6,3,2])
pattern_center = [5]
ws = 5
confidence = 3
result = [(3, 7)]
self.assertEqual(utils.get_borders_of_peaks(pattern_center, data, ws, confidence, inverse = True), result)
def test_get_start_and_end_of_segments(self):
segments = [[1, 2, 3, 4], [5, 6, 7], [8], [], [12, 12]]
result = [[1, 4], [5, 7], [8, 8], [12, 12]]
utils_result = utils.get_start_and_end_of_segments(segments)
for got, expected in zip(utils_result, result):
self.assertEqual(got, expected)
def test_get_start_and_end_of_segments_empty(self):
segments = []
result = []
utils_result = utils.get_start_and_end_of_segments(segments)
self.assertEqual(result, utils_result)
def test_merge_intersecting_segments(self):
test_cases = [
{
'index': [Segment(10, 20), Segment(30, 40)],
'result': [[10, 20], [30, 40]],
'step': 0,
},
{
'index': [Segment(10, 20), Segment(13, 23), Segment(15, 17), Segment(20, 40)],
'result': [[10, 40]],
'step': 0,
},
{
'index': [],
'result': [],
'step': 0,
},
{
'index': [Segment(10, 20)],
'result': [[10, 20]],
'step': 0,
},
{
'index': [Segment(10, 20), Segment(13, 23), Segment(25, 30), Segment(35, 40)],
'result': [[10, 23], [25, 30], [35, 40]],
'step': 0,
},
{
'index': [Segment(10, 50), Segment(5, 40), Segment(15, 25), Segment(6, 50)],
'result': [[5, 50]],
'step': 0,
},
{
'index': [Segment(5, 10), Segment(10, 20), Segment(25, 50)],
'result': [[5, 20], [25, 50]],
'step': 0,
},
{
'index': [Segment(20, 40), Segment(10, 15), Segment(50, 60)],
'result': [[10, 15], [20, 40], [50, 60]],
'step': 0,
},
{
'index': [Segment(20, 40), Segment(10, 20), Segment(50, 60)],
'result': [[10, 40], [50, 60]],
'step': 0,
},
{
'index': [Segment(10, 10), Segment(20, 20), Segment(30, 30)],
'result': [[10, 30]],
'step': 10,
},
]
for case in test_cases:
utils_result = utils.merge_intersecting_segments(case['index'], case['step'])
for got, expected in zip(utils_result, case['result']):
self.assertEqual(got.from_timestamp, expected[0])
self.assertEqual(got.to_timestamp, expected[1])
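# The last case suggests the role of `step`: with step = 10, point segments at 10, 20 and 30 are
# close enough to be merged into a single [10, 30] segment, while step = 0 merges only segments
# that actually touch or overlap.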
def test_serialize(self):
segment_list = [Segment(100,200)]
serialize_list = utils.meta.SerializableList(segment_list)
meta_result = utils.meta.serialize(serialize_list)
expected_result = [{ 'from': 100, 'to': 200 }]
self.assertEqual(meta_result, expected_result)
def test_remove_duplicates_and_sort(self):
a1 = [1, 3, 5]
a2 = [8, 3, 6]
expected_result = [1, 3, 5, 6, 8]
utils_result = utils.remove_duplicates_and_sort(a1+a2)
self.assertEqual(utils_result, expected_result)
self.assertEqual(utils.remove_duplicates_and_sort([]), [])
if __name__ == '__main__':
unittest.main()

43
tests/test_utils_dataframe.py

@ -0,0 +1,43 @@
import unittest
from utils import get_intersected_chunks, get_chunks
import pandas as pd
class TestUtils(unittest.TestCase):
def test_chunks_generator(self):
intersection = 2
chunk_size = 4
cases = [
(list(range(8)), [[0,1,2,3], [2,3,4,5], [4,5,6,7]]),
([], [[]]),
(list(range(1)), [[0]]),
(list(range(4)), [[0,1,2,3]]),
(list(range(9)), [[0,1,2,3], [2,3,4,5], [4,5,6,7], [6,7,8]])
]
for tested, expected in cases:
tested_chunks = get_intersected_chunks(tested, intersection, chunk_size)
self.assertSequenceEqual(tuple(tested_chunks), expected)
def test_non_intersected_chunks(self):
chunk_size = 4
cases = [
(tuple(range(12)), [[0,1,2,3], [4,5,6,7], [8,9,10,11]]),
(tuple(range(9)), [[0,1,2,3], [4,5,6,7], [8]]),
(tuple(range(10)), [[0,1,2,3], [4,5,6,7], [8,9]]),
(tuple(range(11)), [[0,1,2,3], [4,5,6,7], [8,9,10]]),
([], []),
(tuple(range(1)), [[0]]),
(tuple(range(4)), [[0,1,2,3]])
]
for tested, expected in cases:
tested_chunks = list(get_chunks(tested, chunk_size))
self.assertSequenceEqual(tested_chunks, expected)
if __name__ == '__main__':
unittest.main()

122
tools/analytic_model_tester.py

@ -0,0 +1,122 @@
import sys
ANALYTICS_PATH = '../analytics'
TESTS_PATH = '../tests'
sys.path.extend([ANALYTICS_PATH, TESTS_PATH])
import pandas as pd
import numpy as np
import utils
import test_dataset
from analytic_types.segment import Segment
from detectors import pattern_detector, threshold_detector, anomaly_detector
# TODO: get_dataset
# TODO: get_segment
PEAK_DATASETS = []
# dataset with 3 peaks
TEST_DATA = test_dataset.create_dataframe([0, 0, 3, 5, 7, 5, 3, 0, 0, 1, 0, 1, 4, 6, 8, 6, 4, 1, 0, 0, 0, 1, 0, 3, 5, 7, 5, 3, 0, 1, 1])
# TODO: more convenient way to specify labeled segments
POSITIVE_SEGMENTS = [{'from': 1523889000001, 'to': 1523889000007}, {'from': 1523889000022, 'to': 1523889000028}]
NEGATIVE_SEGMENTS = [{'from': 1523889000011, 'to': 1523889000017}]
class TesterSegment():
def __init__(self, start: int, end: int, labeled: bool):
self.start = start
self.end = end
self.labeled = labeled
def get_segment(self):
return {
'_id': 'q',
'analyticUnitId': 'q',
'from': self.start,
'to': self.end,
'labeled': self.labeled,
'deleted': not self.labeled
}
class Metric():
def __init__(self, expected_result, detector_result):
self.expected_result = expected_result
self.detector_result = detector_result['segments']
def get_amount(self):
return len(self.detector_result) / len(self.expected_result)
def get_accuracy(self):
correct_segment = 0
invalid_segment = 0
for segment in self.detector_result:
current_cs = correct_segment
for pattern in self.expected_result:
if pattern['from'] <= segment['from'] and pattern['to'] >= segment['to']:
correct_segment += 1
break
if correct_segment == current_cs:
invalid_segment += 1
non_detected = len(self.expected_result) - correct_segment
return (correct_segment, invalid_segment, non_detected)
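# Illustrative example with assumed values: expected_result = [{'from': 0, 'to': 10}] and
# detector_result = [{'from': 2, 'to': 5}, {'from': 20, 'to': 30}] gives get_accuracy() == (1, 1, 0):
# one detection lies inside a labeled pattern, one lies outside, and no labeled pattern was missed.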
class ModelData():
def __init__(self, frame: pd.DataFrame, positive_segments, negative_segments, model_type: str):
self.frame = frame
self.positive_segments = positive_segments
self.negative_segments = negative_segments
self.model_type = model_type
def get_segments_for_detection(self, positive_amount, negative_amount):
segments = []
for idx, bounds in enumerate(self.positive_segments):
if idx >= positive_amount:
break
segments.append(TesterSegment(bounds['from'], bounds['to'], True).get_segment())
for idx, bounds in enumerate(self.negative_segments):
if idx >= negative_amount:
break
segments.append(TesterSegment(bounds['from'], bounds['to'], False).get_segment())
return segments
def get_all_correct_segments(self):
return self.positive_segments
PEAK_DATA_1 = ModelData(TEST_DATA, POSITIVE_SEGMENTS, NEGATIVE_SEGMENTS, 'peak')
PEAK_DATASETS.append(PEAK_DATA_1)
def main(model_type: str) -> None:
table_metric = []
if model_type == 'peak':
for data in PEAK_DATASETS:
dataset = data.frame
segments = data.get_segments_for_detection(1, 0)
segments = [Segment.from_json(segment) for segment in segments]
detector = pattern_detector.PatternDetector('PEAK', 'test_id')
training_result = detector.train(dataset, segments, {})
cache = training_result['cache']
detect_result = detector.detect(dataset, cache)
detect_result = detect_result.to_json()
peak_metric = Metric(data.get_all_correct_segments(), detect_result)
table_metric.append((peak_metric.get_amount(), peak_metric.get_accuracy()))
return table_metric
if __name__ == '__main__':
'''
This tool applies a model to the bundled datasets and verifies that the detection results match the labeled segments.
sys.argv[1] expects one of the model names -> see correct_name
'''
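# Example invocation (file name assumed from the repo layout):
#   python analytic_model_tester.py peak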
# TODO: use enum
correct_name = ['peak', 'trough', 'jump', 'drop', 'general']
if len(sys.argv) < 2:
print('Enter one of the model names: {}'.format(correct_name))
sys.exit(1)
model_type = str(sys.argv[1]).lower()
if model_type in correct_name:
print(main(model_type))
else:
print('Enter one of the model names: {}'.format(correct_name))

104
tools/send_zmq_message.py

@ -0,0 +1,104 @@
import zmq
import zmq.asyncio
import asyncio
import json
from uuid import uuid4
context = zmq.asyncio.Context()
socket = context.socket(zmq.PAIR)
socket.connect('tcp://0.0.0.0:8002')
def create_message():
message = {
"method": "DATA",
"payload": {
"_id": uuid4().hex,
"analyticUnitId": uuid4().hex,
"type": "PUSH",
"payload": {
"data": [
[
1552652025000,
12.499999999999998
],
[
1552652040000,
12.500000000000002
],
[
1552652055000,
12.499999999999996
],
[
1552652070000,
12.500000000000002
],
[
1552652085000,
12.499999999999998
],
[
1552652100000,
12.5
],
[
1552652115000,
12.83261113785909
]
],
"from": 1552652025001,
"to": 1552652125541,
"analyticUnitType": "GENERAL",
"detector": "pattern",
"cache": {
"pattern_center": [
693
],
"pattern_model": [
1.7763568394002505e-15,
5.329070518200751e-15,
1.7763568394002505e-15,
1.7763568394002505e-15,
1.7763568394002505e-15,
3.552713678800501e-15,
1.7763568394002505e-15,
3.552713678800501e-15,
3.552713678800501e-15,
1.7763568394002505e-15,
1.7763568394002505e-15,
0,
1.7763568394002505e-15,
1.7763568394002505e-15,
0
],
"convolve_max": 7.573064690121713e-29,
"convolve_min": 7.573064690121713e-29,
"WINDOW_SIZE": 7,
"conv_del_min": 7,
"conv_del_max": 7
}
}
}
}
return json.dumps(message)
async def handle_loop():
while True:
received_bytes = await socket.recv()
text = received_bytes.decode('utf-8')
print(text)
async def send_detect():
data = create_message().encode('utf-8')
await socket.send(data)
if __name__ == "__main__":
loop = asyncio.get_event_loop()
socket.send(b'PING')
detects = [send_detect() for _ in range(100)]
detects_group = asyncio.gather(*detects)
handle_group = asyncio.gather(handle_loop())
common_group = asyncio.gather(handle_group, detects_group)
loop.run_until_complete(common_group)
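# Overall flow: PING the analytics server over a PAIR socket, fire 100 identical PUSH/DETECT
# messages concurrently, and keep printing whatever the server sends back until interrupted.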