detectors cleanup & jump_detector integration

7 years ago · 821da82025
9 changed files with 139 additions and 338 deletions
--- a/analytics/analytic_unit_worker.py
+++ b/analytics/analytic_unit_worker.py
@@ -81,6 +81,6 @@ class AnalyticUnitWorker(object):
            if pattern_type == 'general':
                model = detectors.GeneralDetector(analytic_unit_id)
            else:
-                model = detectors.PatternDetectionModel(analytic_unit_id, pattern_type)
+                model = detectors.PatternDetector(analytic_unit_id, pattern_type)
            self.models_cache[analytic_unit_id] = model
        return self.models_cache[analytic_unit_id]
--- a/analytics/detectors/__init__.py
+++ b/analytics/detectors/__init__.py
@@ -1,5 +1,5 @@
 from detectors.general_detector import GeneralDetector
-from detectors.pattern_detection_model import PatternDetectionModel
+from detectors.pattern_detector import PatternDetector
 from detectors.peaks_detector import PeaksDetector
 from detectors.step_detector import StepDetector
 from detectors.jump_detector import Jumpdetector
--- a/analytics/detectors/general_detector.py
+++ b/analytics/detectors/general_detector.py
@@ -75,7 +75,7 @@ class GeneralDetector:
        )

        self.model = self.create_algorithm()
-        self.model.fit(train_augmented, confidence)
+        await self.model.fit(train_augmented, confidence)
        if len(anomalies) > 0:
            last_dataframe_time = dataframe.iloc[-1]['timestamp']
            last_prediction_time = int(last_dataframe_time.timestamp() * 1000)
--- a/analytics/detectors/jump_detector.py
+++ b/analytics/detectors/jump_detector.py
@@ -5,6 +5,7 @@ from scipy.fftpack import fft
 from scipy.signal import argrelextrema
 import math

+
 def is_intersect(target_segment, segments):
    for segment in segments:
        start = max(segment['start'], target_segment[0])
@@ -21,8 +22,7 @@ def exponential_smoothing(series, alpha):

 class Jumpdetector:

-    def __init__(self, pattern):
-        self.pattern = pattern
+    def __init__(self):
        self.segments = []
        self.confidence = 1.5
        self.convolve_max = 120
@@ -47,11 +47,15 @@ class Jumpdetector:
            F = 1 * height / (1 + math.exp(-i * alpha))
            distribution.append(F)
        return distribution
-    def alpha_finder(self, data, ):
-        # поиск альфы для логистической сигмоиды
+    
+    def alpha_finder(self, data):
+        """
+        Search for the alpha of the logistic sigmoid (not implemented yet).
+        """
+        pass


-    def fit(self, dataframe, segments):
+    async def fit(self, dataframe, segments):
        data = dataframe['value']
        confidences = []
        convolve_list = []
--- a/analytics/detectors/pattern_detection_model.py
+++ b/analytics/detectors/pattern_detector.py
@@ -1,5 +1,4 @@
-from detectors.step_detector import StepDetector
-from detectors.peaks_detector import PeaksDetector
+import detectors

 from grafana_data_provider import GrafanaDataProvider

@@ -25,8 +24,17 @@ def segments_box(segments):
    max_time = pd.to_datetime(max_time, unit='ms')
    return min_time, max_time

+def resolve_detector_by_pattern(pattern):
+    if pattern == "peak":
+        return detectors.PeaksDetector()
+    if pattern == "drop":
+        return detectors.StepDetector()
+    if pattern == "jump":
+        return detectors.Jumpdetector()
+    raise ValueError('Unknown pattern "%s"' % pattern)
+

-class PatternDetectionModel:
+class PatternDetector:

    def __init__(self, analytic_unit_id, pattern_type):
        self.analytic_unit_id = analytic_unit_id
@@ -53,14 +61,14 @@ class PatternDetectionModel:
        self.__load_model(pattern_type)

    async def learn(self, segments):
-        self.model = self.__create_model(self.pattern_type)
+        self.model = resolve_detector_by_pattern(self.pattern_type)
        window_size = 200

        dataframe = self.data_prov.get_dataframe()

        segments = self.data_prov.transform_anomalies(segments)
        # TODO: pass only part of dataframe that has segments
-        self.model.fit(dataframe, segments)
+        await self.model.fit(dataframe, segments)
        self.__save_model()
        return 0

@@ -96,13 +104,6 @@ class PatternDetectionModel:
    def synchronize_data(self):
        self.data_prov.synchronize()

-    def __create_model(self, pattern):
-        if pattern == "peak":
-            return PeaksDetector()
-        if pattern == "jump" or pattern == "drop":
-            return StepDetector(pattern)
-        raise ValueError('Unknown pattern "%s"' % pattern)
-
    def __load_anomaly_config(self):
        with open(os.path.join(config.ANALYTIC_UNITS_FOLDER, self.analytic_unit_id + ".json"), 'r') as config_file:
            self.anomaly_config = json.load(config_file)
@@ -116,5 +117,5 @@ class PatternDetectionModel:
        logger.info("Load model '%s'" % self.analytic_unit_id)
        model_filename = os.path.join(config.MODELS_FOLDER, self.pattern_type + ".m")
        if os.path.exists(model_filename):
-            self.model = self.__create_model(pattern)
+            self.model = resolve_detector_by_pattern(pattern)
            self.model.load(model_filename)
--- a/analytics/detectors/peaks_detector.py
+++ b/analytics/detectors/peaks_detector.py
@@ -1,14 +1,42 @@
-import detectors.step_detect
-
 from scipy import signal
 import numpy as np


+def find_steps(array, threshold):
+    """
+    Finds local maxima by segmenting array based on positions at which
+    the threshold value is crossed. Note that this thresholding is
+    applied after the absolute value of the array is taken. Thus,
+    the distinction between upward and downward steps is lost. The
+    get_step_sizes helper that recovered directionality lived in
+    step_detect.py, which this change deletes, so it is unavailable.
+    Parameters
+    ----------
+    array : numpy array
+        1 dimensional array that represents time series of data points
+    threshold : int / float
+        Threshold value that defines a step
+    Returns
+    -------
+    steps : list
+        List of indices of the detected steps
+    """
+    steps        = []
+    array        = np.abs(array)
+    above_points = np.where(array > threshold, 1, 0)
+    ap_dif       = np.diff(above_points)
+    cross_ups    = np.where(ap_dif == 1)[0]
+    cross_dns    = np.where(ap_dif == -1)[0]
+    for upi, dni in zip(cross_ups,cross_dns):
+        steps.append(np.argmax(array[upi:dni]) + upi)
+    return steps
+
+
 class PeaksDetector:
    def __init__(self):
        pass

-    def fit(self, dataset, contamination=0.005):
+    async def fit(self, dataset, contamination=0.005):
        pass

    async def predict(self, dataframe):
@@ -52,7 +80,7 @@ class PeaksDetector:
        data = filtered
        data /= data.max()

-        result = step_detect.find_steps(data, 0.1)
+        result = find_steps(data, 0.1)
        return [(dataframe.index[x], dataframe.index[x + window_size]) for x in result]

    def save(self, model_filename):
--- a/analytics/detectors/step_detect.py
+++ b/analytics/detectors/step_detect.py
@@ -1,231 +0,0 @@
-
-"""
-Thomas Kahn
-thomas.b.kahn@gmail.com
-"""
-from __future__ import absolute_import
-from math import sqrt
-import multiprocessing as mp
-import numpy as np
-from six.moves import range
-from six.moves import zip
-
-
-def t_scan(L, window = 1e3, num_workers = -1):
-    """
-    Computes t statistic for i to i+window points versus i-window to i
-    points for each point i in input array. Uses multiple processes to
-    do this calculation asynchronously. Array is decomposed into window
-    number of frames, each consisting of points spaced at window
-    intervals. This optimizes the calculation, as the drone function
-    need only compute the mean and variance for each set once.
-    Parameters
-    ----------
-    L : numpy array
-        1 dimensional array that represents time series of datapoints
-    window : int / float
-        Number of points that comprise the windows of data that are
-        compared
-    num_workers : int
-        Number of worker processes for multithreaded t_stat computation
-        Defult value uses num_cpu - 1 workers
-    Returns
-    -------
-    t_stat : numpy array
-        Array which holds t statistic values for each point. The first
-        and last (window) points are replaced with zero, since the t
-        statistic calculation cannot be performed in that case.
-    """
-    size    = L.size
-    window  = int(window)
-    frames  = list(range(window))
-    n_cols  = (size // window) - 1
-
-    t_stat  = np.zeros((window, n_cols))
-
-    if num_workers == 1:
-        results = [_t_scan_drone(L, n_cols, frame, window) for frame in frames]
-    else:
-        if num_workers == -1:
-            num_workers = mp.cpu_count() - 1
-        pool    = mp.Pool(processes = num_workers)
-        results = [pool.apply_async(_t_scan_drone, args=(L, n_cols, frame, window)) for frame in frames]
-        results = [r.get() for r in results]
-        pool.close()
-
-    for index, row in results:
-        t_stat[index] = row
-
-    t_stat  = np.concatenate((
-        np.zeros(window),
-        t_stat.transpose().ravel(order='C'),
-        np.zeros(size % window)
-    ))
-
-    return t_stat
-
-
-def _t_scan_drone(L, n_cols, frame, window=1e3):
-    """
-    Drone function for t_scan. Not Intended to be called manually.
-    Computes t_scan for the designated frame, and returns result as
-    array along with an integer tag for proper placement in the
-    aggregate array
-    """
-    size   = L.size
-    window = int(window)
-    root_n = sqrt(window)
-
-    output = np.zeros(n_cols)
-    b      = L[frame:window+frame]
-    b_mean = b.mean()
-    b_var  = b.var()
-    for i in range(window+frame, size-window, window):
-        a = L[i:i+window]
-        a_mean = a.mean()
-        a_var  = a.var()
-        output[i // window - 1] = root_n * (a_mean - b_mean) / sqrt(a_var + b_var)
-        b_mean, b_var = a_mean, a_var
-
-    return frame, output
-
-
-def mz_fwt(x, n=2):
-    """
-    Computes the multiscale product of the Mallat-Zhong discrete forward
-    wavelet transform up to and including scale n for the input data x.
-    If n is even, the spikes in the signal will be positive. If n is odd
-    the spikes will match the polarity of the step (positive for steps
-    up, negative for steps down).
-    This function is essentially a direct translation of the MATLAB code
-    provided by Sadler and Swami in section A.4 of the following:
-    http://www.dtic.mil/dtic/tr/fulltext/u2/a351960.pdf
-    Parameters
-    ----------
-    x : numpy array
-        1 dimensional array that represents time series of data points
-    n : int
-        Highest scale to multiply to
-    Returns
-    -------
-    prod : numpy array
-        The multiscale product for x
-    """
-    N_pnts   = x.size
-    lambda_j = [1.5, 1.12, 1.03, 1.01][0:n]
-    if n > 4:
-        lambda_j += [1.0]*(n-4)
-
-    H = np.array([0.125, 0.375, 0.375, 0.125])
-    G = np.array([2.0, -2.0])
-
-    Gn = [2]
-    Hn = [3]
-    for j in range(1,n):
-        q = 2**(j-1)
-        Gn.append(q+1)
-        Hn.append(3*q+1)
-
-    S    = np.concatenate((x[::-1], x))
-    S    = np.concatenate((S, x[::-1]))
-    prod = np.ones(N_pnts)
-    for j in range(n):
-        n_zeros = 2**j - 1
-        Gz      = _insert_zeros(G, n_zeros)
-        Hz      = _insert_zeros(H, n_zeros)
-        current = (1.0/lambda_j[j])*np.convolve(S,Gz)
-        current = current[N_pnts+Gn[j]:2*N_pnts+Gn[j]]
-        prod    *= current
-        if j == n-1:
-            break
-        S_new   = np.convolve(S, Hz)
-        S_new   = S_new[N_pnts+Hn[j]:2*N_pnts+Hn[j]]
-        S       = np.concatenate((S_new[::-1], S_new))
-        S       = np.concatenate((S, S_new[::-1]))
-    return prod
-
-
-def _insert_zeros(x, n):
-    """
-    Helper function for mz_fwt. Splits input array and adds n zeros
-    between values.
-    """
-    newlen       = (n+1)*x.size
-    out          = np.zeros(newlen)
-    indices      = list(range(0, newlen-n, n+1))
-    out[indices] = x
-    return out
-
-
-def find_steps(array, threshold):
-    """
-    Finds local maxima by segmenting array based on positions at which
-    the threshold value is crossed. Note that this thresholding is
-    applied after the absolute value of the array is taken. Thus,
-    the distinction between upward and downward steps is lost. However,
-    get_step_sizes can be used to determine directionality after the
-    fact.
-    Parameters
-    ----------
-    array : numpy array
-        1 dimensional array that represents time series of data points
-    threshold : int / float
-        Threshold value that defines a step
-    Returns
-    -------
-    steps : list
-        List of indices of the detected steps
-    """
-    steps        = []
-    array        = np.abs(array)
-    above_points = np.where(array > threshold, 1, 0)
-    ap_dif       = np.diff(above_points)
-    cross_ups    = np.where(ap_dif == 1)[0]
-    cross_dns    = np.where(ap_dif == -1)[0]
-    for upi, dni in zip(cross_ups,cross_dns):
-        steps.append(np.argmax(array[upi:dni]) + upi)
-    return steps
-
-
-def get_step_sizes(array, indices, window=1000):
-    """
-    Calculates step size for each index within the supplied list. Step
-    size is determined by averaging over a range of points (specified
-    by the window parameter) before and after the index of step
-    occurrence. The directionality of the step is reflected by the sign
-    of the step size (i.e. a positive value indicates an upward step,
-    and a negative value indicates a downward step). The combined
-    standard deviation of both measurements (as a measure of uncertainty
-    in step calculation) is also provided.
-    Parameters
-    ----------
-    array : numpy array
-        1 dimensional array that represents time series of data points
-    indices : list
-        List of indices of the detected steps (as provided by
-        find_steps, for example)
-    window : int, optional
-        Number of points to average over to determine baseline levels
-        before and after step.
-    Returns
-    -------
-    step_sizes : list
-        List of the calculated sizes of each step
-    step_error : list
-    """
-    step_sizes = []
-    step_error = []
-    indices    = sorted(indices)
-    last       = len(indices) - 1
-    for i, index in enumerate(indices):
-        if i == 0:
-            q = min(window, indices[i+1]-index)
-        elif i == last:
-            q = min(window, index - indices[i-1])
-        else:
-            q = min(window, index-indices[i-1], indices[i+1]-index)
-        a = array[index:index+q]
-        b = array[index-q:index]
-        step_sizes.append(a.mean() - b.mean())
-        step_error.append(sqrt(a.var()+b.var()))
-    return step_sizes, step_error
--- a/analytics/detectors/step_detector.py
+++ b/analytics/detectors/step_detector.py
@@ -23,13 +23,12 @@ def exponential_smoothing(series, alpha):

 class StepDetector:

-    def __init__(self, pattern):
-        self.pattern = pattern
+    def __init__(self):
        self.segments = []
        self.confidence = 1.5
        self.convolve_max = 570000

-    def fit(self, dataframe, segments):
+    async def fit(self, dataframe, segments):
        data = dataframe['value']
        confidences = []
        convolve_list = []
--- a/analytics/supervised_algorithm.py
+++ b/analytics/supervised_algorithm.py
@@ -31,7 +31,7 @@ class supervised_algorithm(object):
        self.col_to_max, self.col_to_min, self.col_to_median = None, None, None
        self.augmented_path = None

-    def fit(self, dataset, contamination=0.005):
+    async def fit(self, dataset, contamination=0.005):
        dataset = dataset[self.good_features]
        dataset = dataset[-100000:]