Browse Source

detectors cleanup & jump_detector integration

pull/1/head
Alexey Velikiy 6 years ago
parent
commit
821da82025
  1. 2
      analytics/analytic_unit_worker.py
  2. 2
      analytics/detectors/__init__.py
  3. 2
      analytics/detectors/general_detector.py
  4. 164
      analytics/detectors/jump_detector.py
  5. 29
      analytics/detectors/pattern_detector.py
  6. 36
      analytics/detectors/peaks_detector.py
  7. 231
      analytics/detectors/step_detect.py
  8. 5
      analytics/detectors/step_detector.py
  9. 2
      analytics/supervised_algorithm.py

2
analytics/analytic_unit_worker.py

@ -81,6 +81,6 @@ class AnalyticUnitWorker(object):
if pattern_type == 'general': if pattern_type == 'general':
model = detectors.GeneralDetector(analytic_unit_id) model = detectors.GeneralDetector(analytic_unit_id)
else: else:
model = detectors.PatternDetectionModel(analytic_unit_id, pattern_type) model = detectors.PatternDetector(analytic_unit_id, pattern_type)
self.models_cache[analytic_unit_id] = model self.models_cache[analytic_unit_id] = model
return self.models_cache[analytic_unit_id] return self.models_cache[analytic_unit_id]

2
analytics/detectors/__init__.py

@ -1,5 +1,5 @@
from detectors.general_detector import GeneralDetector from detectors.general_detector import GeneralDetector
from detectors.pattern_detection_model import PatternDetectionModel from detectors.pattern_detector import PatternDetector
from detectors.peaks_detector import PeaksDetector from detectors.peaks_detector import PeaksDetector
from detectors.step_detector import StepDetector from detectors.step_detector import StepDetector
from detectors.jump_detector import Jumpdetector from detectors.jump_detector import Jumpdetector

2
analytics/detectors/general_detector.py

@ -75,7 +75,7 @@ class GeneralDetector:
) )
self.model = self.create_algorithm() self.model = self.create_algorithm()
self.model.fit(train_augmented, confidence) await self.model.fit(train_augmented, confidence)
if len(anomalies) > 0: if len(anomalies) > 0:
last_dataframe_time = dataframe.iloc[-1]['timestamp'] last_dataframe_time = dataframe.iloc[-1]['timestamp']
last_prediction_time = int(last_dataframe_time.timestamp() * 1000) last_prediction_time = int(last_dataframe_time.timestamp() * 1000)

164
analytics/detectors/jump_detector.py

@ -5,6 +5,7 @@ from scipy.fftpack import fft
from scipy.signal import argrelextrema from scipy.signal import argrelextrema
import math import math
def is_intersect(target_segment, segments): def is_intersect(target_segment, segments):
for segment in segments: for segment in segments:
start = max(segment['start'], target_segment[0]) start = max(segment['start'], target_segment[0])
@ -21,8 +22,7 @@ def exponential_smoothing(series, alpha):
class Jumpdetector: class Jumpdetector:
def __init__(self, pattern): def __init__(self):
self.pattern = pattern
self.segments = [] self.segments = []
self.confidence = 1.5 self.confidence = 1.5
self.convolve_max = 120 self.convolve_max = 120
@ -30,11 +30,11 @@ class Jumpdetector:
def intersection_segment(self, data, median): def intersection_segment(self, data, median):
cen_ind = [] cen_ind = []
for i in range(1, len(data)-1): for i in range(1, len(data)-1):
if data[i-1] < median and data[i+1] > median: if data[i - 1] < median and data[i + 1] > median:
cen_ind.append(i) cen_ind.append(i)
del_ind = [] del_ind = []
for i in range(1,len(cen_ind)): for i in range(1,len(cen_ind)):
if cen_ind[i] == cen_ind[i-1]+1: if cen_ind[i] == cen_ind[i - 1] + 1:
del_ind.append(i - 1) del_ind.append(i - 1)
del_ind = del_ind[::-1] del_ind = del_ind[::-1]
for i in del_ind: for i in del_ind:
@ -47,71 +47,75 @@ class Jumpdetector:
F = 1 * height / (1 + math.exp(-i * alpha)) F = 1 * height / (1 + math.exp(-i * alpha))
distribution.append(F) distribution.append(F)
return distribution return distribution
def alpha_finder(self, data, ):
# поиск альфы для логистической сигмоиды def alpha_finder(self, data):
"""
поиск альфы для логистической сигмоиды
def fit(self, dataframe, segments): """
data = dataframe['value'] pass
confidences = []
convolve_list = []
for segment in segments: async def fit(self, dataframe, segments):
if segment['labeled']: data = dataframe['value']
segment_data = data[segment['start'] : segment['finish'] + 1] confidences = []
segment_min = min(segment_data) convolve_list = []
segment_max = max(segment_data) for segment in segments:
confidences.append(0.20 * (segment_max - segment_min)) if segment['labeled']:
flat_segment = segment_data.rolling(window=4).mean() #сглаживаем сегмент segment_data = data[segment['start'] : segment['finish'] + 1]
kde_segment = flat_data.dropna().plot.kde() # distribution density segment_min = min(segment_data)
ax = flat_data.dropna().plot.kde() segment_max = max(segment_data)
ax_list = ax.get_lines()[0].get_xydata() confidences.append(0.20 * (segment_max - segment_min))
mids = argrelextrema(np.array(ax_list), np.less)[0] flat_segment = segment_data.rolling(window=4).mean() #сглаживаем сегмент
maxs = argrelextrema(np.array(ax_list), np.greater)[0] kde_segment = flat_data.dropna().plot.kde() # distribution density
min_peak = maxs[0] ax = flat_data.dropna().plot.kde()
max_peak = maxs[1] ax_list = ax.get_lines()[0].get_xydata()
min_line = ax_list[min_peak, 0] mids = argrelextrema(np.array(ax_list), np.less)[0]
max_line = ax_list[max_peak, 0] maxs = argrelextrema(np.array(ax_list), np.greater)[0]
sigm_heidht = max_line - min_line min_peak = maxs[0]
pat_sigm = logistic_sigmoid(-120, 120, 1, sigm_heidht) max_peak = maxs[1]
for i in range(0, len(pat_sigm)): min_line = ax_list[min_peak, 0]
pat_sigm[i] = pat_sigm[i] + min_line max_line = ax_list[max_peak, 0]
cen_ind = self.intersection_segment(flat_segment, mids[0]) sigm_heidht = max_line - min_line
c = [] pat_sigm = logistic_sigmoid(-120, 120, 1, sigm_heidht)
for i in range(len(cen_ind)): for i in range(0, len(pat_sigm)):
x = cen_ind[i] pat_sigm[i] = pat_sigm[i] + min_line
cx = scipy.signal.fftconvolve(pat_sigm, flat_data[x-120:x+120]) cen_ind = self.intersection_segment(flat_segment, mids[0])
c.append(cx[240]) c = []
for i in range(len(cen_ind)):
# в идеале нужно посмотреть гистограмму сегмента и выбрать среднее значение, x = cen_ind[i]
# далее от него брать + -120 cx = scipy.signal.fftconvolve(pat_sigm, flat_data[x-120:x+120])
segment_summ = 0 c.append(cx[240])
for val in flat_segment:
segment_summ += val # в идеале нужно посмотреть гистограмму сегмента и выбрать среднее значение,
segment_mid = segment_summ / len(flat_segment) #посчитать нормально среднее значение/медиану # далее от него брать + -120
for ind in range(1, len(flat_segment) - 1): segment_summ = 0
if flat_segment[ind + 1] > segment_mid and flat_segment[ind - 1] < segment_mid: for val in flat_segment:
flat_mid_index = ind # найти пересечение средней и графика, получить его индекс segment_summ += val
segment_mid_index = flat_mid_index - 5 segment_mid = segment_summ / len(flat_segment) #посчитать нормально среднее значение/медиану
labeled_drop = data[segment_mid_index - 120 : segment_mid_index + 120] for ind in range(1, len(flat_segment) - 1):
labeled_min = min(labeled_drop) if flat_segment[ind + 1] > segment_mid and flat_segment[ind - 1] < segment_mid:
for value in labeled_drop: # обрезаем flat_mid_index = ind # найти пересечение средней и графика, получить его индекс
value = value - labeled_min segment_mid_index = flat_mid_index - 5
labeled_max = max(labeled_drop) labeled_drop = data[segment_mid_index - 120 : segment_mid_index + 120]
for value in labeled_drop: # нормируем labeled_min = min(labeled_drop)
value = value / labeled_max for value in labeled_drop: # обрезаем
convolve = scipy.signal.fftconvolve(labeled_drop, labeled_drop) value = value - labeled_min
convolve_list.append(max(convolve)) # сворачиваем паттерн labeled_max = max(labeled_drop)
# плюс надо впихнуть сюда логистическую сигмоиду и поиск альфы for value in labeled_drop: # нормируем
value = value / labeled_max
if len(confidences) > 0: convolve = scipy.signal.fftconvolve(labeled_drop, labeled_drop)
self.confidence = min(confidences) convolve_list.append(max(convolve)) # сворачиваем паттерн
else: # плюс надо впихнуть сюда логистическую сигмоиду и поиск альфы
self.confidence = 1.5
if len(confidences) > 0:
if len(convolve_list) > 0: self.confidence = min(confidences)
self.convolve_max = max(convolve_list) else:
else: self.confidence = 1.5
self.convolve_max = 120 # макс метрика свертки равна отступу(120), вау!
if len(convolve_list) > 0:
self.convolve_max = max(convolve_list)
else:
self.convolve_max = 120 # макс метрика свертки равна отступу(120), вау!
def logistic_sigmoid(x1, x2, alpha, height): def logistic_sigmoid(x1, x2, alpha, height):
distribution = [] distribution = []
@ -131,20 +135,20 @@ class Jumpdetector:
return result return result
def __predict(self, data): def __predict(self, data):
window_size = 24 window_size = 24
all_max_flatten_data = data.rolling(window=window_size).mean() all_max_flatten_data = data.rolling(window=window_size).mean()
extrema_list = [] extrema_list = []
# добавить все пересечения экспоненты со сглаженным графиком # добавить все пересечения экспоненты со сглаженным графиком
for i in exponential_smoothing(data + self.confidence, 0.02): for i in exponential_smoothing(data + self.confidence, 0.02):
extrema_list.append(i) extrema_list.append(i)
segments = [] segments = []
for i in all_mins: for i in all_mins:
if all_max_flatten_data[i] > extrema_list[i]: if all_max_flatten_data[i] > extrema_list[i]:
segments.append(i - window_size) segments.append(i - window_size)
return [(x - 1, x + 1) for x in self.__filter_prediction(segments, all_max_flatten_data)] return [(x - 1, x + 1) for x in self.__filter_prediction(segments, all_max_flatten_data)]
def __filter_prediction(self, segments, all_max_flatten_data): def __filter_prediction(self, segments, all_max_flatten_data):
delete_list = [] delete_list = []

29
analytics/detectors/pattern_detection_model.py → analytics/detectors/pattern_detector.py

@ -1,5 +1,4 @@
from detectors.step_detector import StepDetector import detectors
from detectors.peaks_detector import PeaksDetector
from grafana_data_provider import GrafanaDataProvider from grafana_data_provider import GrafanaDataProvider
@ -25,8 +24,17 @@ def segments_box(segments):
max_time = pd.to_datetime(max_time, unit='ms') max_time = pd.to_datetime(max_time, unit='ms')
return min_time, max_time return min_time, max_time
def resolve_detector_by_pattern(pattern):
if pattern == "peak":
return detectors.PeaksDetector()
if pattern == "drop":
return detectors.StepDetector()
if pattern == "jump":
return detectors.Jumpdetector()
raise ValueError('Unknown pattern "%s"' % pattern)
class PatternDetectionModel:
class PatternDetector:
def __init__(self, analytic_unit_id, pattern_type): def __init__(self, analytic_unit_id, pattern_type):
self.analytic_unit_id = analytic_unit_id self.analytic_unit_id = analytic_unit_id
@ -53,14 +61,14 @@ class PatternDetectionModel:
self.__load_model(pattern_type) self.__load_model(pattern_type)
async def learn(self, segments): async def learn(self, segments):
self.model = self.__create_model(self.pattern_type) self.model = resolve_detector_by_pattern(self.pattern_type)
window_size = 200 window_size = 200
dataframe = self.data_prov.get_dataframe() dataframe = self.data_prov.get_dataframe()
segments = self.data_prov.transform_anomalies(segments) segments = self.data_prov.transform_anomalies(segments)
# TODO: pass only part of dataframe that has segments # TODO: pass only part of dataframe that has segments
self.model.fit(dataframe, segments) await self.model.fit(dataframe, segments)
self.__save_model() self.__save_model()
return 0 return 0
@ -88,7 +96,7 @@ class PatternDetectionModel:
'finish': max(ts1, ts2) 'finish': max(ts1, ts2)
}) })
last_dataframe_time = dataframe.iloc[- 1]['timestamp'] last_dataframe_time = dataframe.iloc[-1]['timestamp']
last_prediction_time = int(last_dataframe_time.timestamp() * 1000) last_prediction_time = int(last_dataframe_time.timestamp() * 1000)
return segments, last_prediction_time return segments, last_prediction_time
# return predicted_anomalies, last_prediction_time # return predicted_anomalies, last_prediction_time
@ -96,13 +104,6 @@ class PatternDetectionModel:
def synchronize_data(self): def synchronize_data(self):
self.data_prov.synchronize() self.data_prov.synchronize()
def __create_model(self, pattern):
if pattern == "peak":
return PeaksDetector()
if pattern == "jump" or pattern == "drop":
return StepDetector(pattern)
raise ValueError('Unknown pattern "%s"' % pattern)
def __load_anomaly_config(self): def __load_anomaly_config(self):
with open(os.path.join(config.ANALYTIC_UNITS_FOLDER, self.analytic_unit_id + ".json"), 'r') as config_file: with open(os.path.join(config.ANALYTIC_UNITS_FOLDER, self.analytic_unit_id + ".json"), 'r') as config_file:
self.anomaly_config = json.load(config_file) self.anomaly_config = json.load(config_file)
@ -116,5 +117,5 @@ class PatternDetectionModel:
logger.info("Load model '%s'" % self.analytic_unit_id) logger.info("Load model '%s'" % self.analytic_unit_id)
model_filename = os.path.join(config.MODELS_FOLDER, self.pattern_type + ".m") model_filename = os.path.join(config.MODELS_FOLDER, self.pattern_type + ".m")
if os.path.exists(model_filename): if os.path.exists(model_filename):
self.model = self.__create_model(pattern) self.model = resolve_detector_by_pattern(pattern)
self.model.load(model_filename) self.model.load(model_filename)

36
analytics/detectors/peaks_detector.py

@ -1,14 +1,42 @@
import detectors.step_detect
from scipy import signal from scipy import signal
import numpy as np import numpy as np
def find_steps(array, threshold):
"""
Finds local maxima by segmenting array based on positions at which
the threshold value is crossed. Note that this thresholding is
applied after the absolute value of the array is taken. Thus,
the distinction between upward and downward steps is lost. However,
get_step_sizes can be used to determine directionality after the
fact.
Parameters
----------
array : numpy array
1 dimensional array that represents time series of data points
threshold : int / float
Threshold value that defines a step
Returns
-------
steps : list
List of indices of the detected steps
"""
steps = []
array = np.abs(array)
above_points = np.where(array > threshold, 1, 0)
ap_dif = np.diff(above_points)
cross_ups = np.where(ap_dif == 1)[0]
cross_dns = np.where(ap_dif == -1)[0]
for upi, dni in zip(cross_ups,cross_dns):
steps.append(np.argmax(array[upi:dni]) + upi)
return steps
class PeaksDetector: class PeaksDetector:
def __init__(self): def __init__(self):
pass pass
def fit(self, dataset, contamination=0.005): async def fit(self, dataset, contamination=0.005):
pass pass
async def predict(self, dataframe): async def predict(self, dataframe):
@ -52,7 +80,7 @@ class PeaksDetector:
data = filtered data = filtered
data /= data.max() data /= data.max()
result = step_detect.find_steps(data, 0.1) result = find_steps(data, 0.1)
return [(dataframe.index[x], dataframe.index[x + window_size]) for x in result] return [(dataframe.index[x], dataframe.index[x + window_size]) for x in result]
def save(self, model_filename): def save(self, model_filename):

231
analytics/detectors/step_detect.py

@ -1,231 +0,0 @@
"""
Thomas Kahn
thomas.b.kahn@gmail.com
"""
from __future__ import absolute_import
from math import sqrt
import multiprocessing as mp
import numpy as np
from six.moves import range
from six.moves import zip
def t_scan(L, window = 1e3, num_workers = -1):
"""
Computes t statistic for i to i+window points versus i-window to i
points for each point i in input array. Uses multiple processes to
do this calculation asynchronously. Array is decomposed into window
number of frames, each consisting of points spaced at window
intervals. This optimizes the calculation, as the drone function
need only compute the mean and variance for each set once.
Parameters
----------
L : numpy array
1 dimensional array that represents time series of datapoints
window : int / float
Number of points that comprise the windows of data that are
compared
num_workers : int
Number of worker processes for multithreaded t_stat computation
Defult value uses num_cpu - 1 workers
Returns
-------
t_stat : numpy array
Array which holds t statistic values for each point. The first
and last (window) points are replaced with zero, since the t
statistic calculation cannot be performed in that case.
"""
size = L.size
window = int(window)
frames = list(range(window))
n_cols = (size // window) - 1
t_stat = np.zeros((window, n_cols))
if num_workers == 1:
results = [_t_scan_drone(L, n_cols, frame, window) for frame in frames]
else:
if num_workers == -1:
num_workers = mp.cpu_count() - 1
pool = mp.Pool(processes = num_workers)
results = [pool.apply_async(_t_scan_drone, args=(L, n_cols, frame, window)) for frame in frames]
results = [r.get() for r in results]
pool.close()
for index, row in results:
t_stat[index] = row
t_stat = np.concatenate((
np.zeros(window),
t_stat.transpose().ravel(order='C'),
np.zeros(size % window)
))
return t_stat
def _t_scan_drone(L, n_cols, frame, window=1e3):
"""
Drone function for t_scan. Not Intended to be called manually.
Computes t_scan for the designated frame, and returns result as
array along with an integer tag for proper placement in the
aggregate array
"""
size = L.size
window = int(window)
root_n = sqrt(window)
output = np.zeros(n_cols)
b = L[frame:window+frame]
b_mean = b.mean()
b_var = b.var()
for i in range(window+frame, size-window, window):
a = L[i:i+window]
a_mean = a.mean()
a_var = a.var()
output[i // window - 1] = root_n * (a_mean - b_mean) / sqrt(a_var + b_var)
b_mean, b_var = a_mean, a_var
return frame, output
def mz_fwt(x, n=2):
"""
Computes the multiscale product of the Mallat-Zhong discrete forward
wavelet transform up to and including scale n for the input data x.
If n is even, the spikes in the signal will be positive. If n is odd
the spikes will match the polarity of the step (positive for steps
up, negative for steps down).
This function is essentially a direct translation of the MATLAB code
provided by Sadler and Swami in section A.4 of the following:
http://www.dtic.mil/dtic/tr/fulltext/u2/a351960.pdf
Parameters
----------
x : numpy array
1 dimensional array that represents time series of data points
n : int
Highest scale to multiply to
Returns
-------
prod : numpy array
The multiscale product for x
"""
N_pnts = x.size
lambda_j = [1.5, 1.12, 1.03, 1.01][0:n]
if n > 4:
lambda_j += [1.0]*(n-4)
H = np.array([0.125, 0.375, 0.375, 0.125])
G = np.array([2.0, -2.0])
Gn = [2]
Hn = [3]
for j in range(1,n):
q = 2**(j-1)
Gn.append(q+1)
Hn.append(3*q+1)
S = np.concatenate((x[::-1], x))
S = np.concatenate((S, x[::-1]))
prod = np.ones(N_pnts)
for j in range(n):
n_zeros = 2**j - 1
Gz = _insert_zeros(G, n_zeros)
Hz = _insert_zeros(H, n_zeros)
current = (1.0/lambda_j[j])*np.convolve(S,Gz)
current = current[N_pnts+Gn[j]:2*N_pnts+Gn[j]]
prod *= current
if j == n-1:
break
S_new = np.convolve(S, Hz)
S_new = S_new[N_pnts+Hn[j]:2*N_pnts+Hn[j]]
S = np.concatenate((S_new[::-1], S_new))
S = np.concatenate((S, S_new[::-1]))
return prod
def _insert_zeros(x, n):
"""
Helper function for mz_fwt. Splits input array and adds n zeros
between values.
"""
newlen = (n+1)*x.size
out = np.zeros(newlen)
indices = list(range(0, newlen-n, n+1))
out[indices] = x
return out
def find_steps(array, threshold):
"""
Finds local maxima by segmenting array based on positions at which
the threshold value is crossed. Note that this thresholding is
applied after the absolute value of the array is taken. Thus,
the distinction between upward and downward steps is lost. However,
get_step_sizes can be used to determine directionality after the
fact.
Parameters
----------
array : numpy array
1 dimensional array that represents time series of data points
threshold : int / float
Threshold value that defines a step
Returns
-------
steps : list
List of indices of the detected steps
"""
steps = []
array = np.abs(array)
above_points = np.where(array > threshold, 1, 0)
ap_dif = np.diff(above_points)
cross_ups = np.where(ap_dif == 1)[0]
cross_dns = np.where(ap_dif == -1)[0]
for upi, dni in zip(cross_ups,cross_dns):
steps.append(np.argmax(array[upi:dni]) + upi)
return steps
def get_step_sizes(array, indices, window=1000):
"""
Calculates step size for each index within the supplied list. Step
size is determined by averaging over a range of points (specified
by the window parameter) before and after the index of step
occurrence. The directionality of the step is reflected by the sign
of the step size (i.e. a positive value indicates an upward step,
and a negative value indicates a downward step). The combined
standard deviation of both measurements (as a measure of uncertainty
in step calculation) is also provided.
Parameters
----------
array : numpy array
1 dimensional array that represents time series of data points
indices : list
List of indices of the detected steps (as provided by
find_steps, for example)
window : int, optional
Number of points to average over to determine baseline levels
before and after step.
Returns
-------
step_sizes : list
List of the calculated sizes of each step
step_error : list
"""
step_sizes = []
step_error = []
indices = sorted(indices)
last = len(indices) - 1
for i, index in enumerate(indices):
if i == 0:
q = min(window, indices[i+1]-index)
elif i == last:
q = min(window, index - indices[i-1])
else:
q = min(window, index-indices[i-1], indices[i+1]-index)
a = array[index:index+q]
b = array[index-q:index]
step_sizes.append(a.mean() - b.mean())
step_error.append(sqrt(a.var()+b.var()))
return step_sizes, step_error

5
analytics/detectors/step_detector.py

@ -23,13 +23,12 @@ def exponential_smoothing(series, alpha):
class StepDetector: class StepDetector:
def __init__(self, pattern): def __init__(self):
self.pattern = pattern
self.segments = [] self.segments = []
self.confidence = 1.5 self.confidence = 1.5
self.convolve_max = 570000 self.convolve_max = 570000
def fit(self, dataframe, segments): async def fit(self, dataframe, segments):
data = dataframe['value'] data = dataframe['value']
confidences = [] confidences = []
convolve_list = [] convolve_list = []

2
analytics/supervised_algorithm.py

@ -31,7 +31,7 @@ class supervised_algorithm(object):
self.col_to_max, self.col_to_min, self.col_to_median = None, None, None self.col_to_max, self.col_to_min, self.col_to_median = None, None, None
self.augmented_path = None self.augmented_path = None
def fit(self, dataset, contamination=0.005): async def fit(self, dataset, contamination=0.005):
dataset = dataset[self.good_features] dataset = dataset[self.good_features]
dataset = dataset[-100000:] dataset = dataset[-100000:]

Loading…
Cancel
Save