hastic-server/analytics/anomaly_model.py

import os.path
from data_provider import DataProvider
from data_preprocessor import data_preprocessor
import json
import pandas as pd
import logging
from urllib.parse import urlparse
import config


# Module-level logger shared by everything in this file; records are emitted
# under the fixed logger name 'analytic_toolset'.
logger = logging.getLogger('analytic_toolset')


def anomalies_to_timestamp(anomalies):
    """Rewrite each anomaly's 'start' and 'finish' as integer milliseconds.

    Every item must be a mapping whose 'start' and 'finish' values expose a
    ``timestamp()`` method returning seconds; each value is replaced with
    ``int(seconds * 1000)``.

    The items are modified in place and the very same container that was
    passed in is returned.
    """
    for segment in anomalies:
        for bound in ('start', 'finish'):
            seconds = segment[bound].timestamp()
            segment[bound] = int(seconds * 1000)
    return anomalies


class AnomalyModel:
    """Train, persist and apply a detector for one named anomaly.

    Construction reads ``<anomaly_name>.json`` from ``config.ANOMALIES_FOLDER``,
    reads the metric target from ``config.METRICS_FOLDER``, builds a
    ``DataProvider`` and a ``data_preprocessor`` over the metric's files in
    ``config.DATASET_FOLDER`` and, if a saved model file exists in
    ``config.MODELS_FOLDER``, loads it into ``self.model``.
    """

    # Maximum number of rows augmented and scored per iteration in predict().
    PREDICT_CHUNK_SIZE = 50000

    def __init__(self, anomaly_name):
        """Load the configuration, data helpers and saved model for *anomaly_name*.

        Args:
            anomaly_name: base name (without extension) of the anomaly's
                config file; also used for the labeled-anomalies file and
                the model file.

        Raises:
            OSError: if the anomaly config or the metric target file cannot
                be opened.
            KeyError: if the config lacks 'panelUrl', 'datasource' or
                'metric'/'targets'.
        """
        self.anomaly_name = anomaly_name
        self.load_anomaly_config()

        # The datasource is addressed through the origin (scheme + host) of
        # the panel URL stored in the anomaly config.
        parsed_url = urlparse(self.anomaly_config['panelUrl'])
        origin = parsed_url.scheme + '://' + parsed_url.netloc

        # NOTE: this writes 'origin' into the dict held by self.anomaly_config.
        datasource = self.anomaly_config['datasource']
        datasource['origin'] = origin
        # Only the first configured target is used.
        metric_name = self.anomaly_config['metric']['targets'][0]

        target_filename = os.path.join(config.METRICS_FOLDER, metric_name + ".json")

        dataset_filename = os.path.join(config.DATASET_FOLDER, metric_name + ".csv")
        augmented_path = os.path.join(config.DATASET_FOLDER, metric_name + "_augmented.csv")

        with open(target_filename, 'r', encoding='utf-8') as file:
            target = json.load(file)

        self.data_prov = DataProvider(datasource, target, dataset_filename)
        self.preprocessor = data_preprocessor(self.data_prov, augmented_path)
        # Stays None until a saved model is loaded or learn() is called.
        self.model = None

        self.__load_model()

    def anomalies_box(self, anomalies):
        """Return the time window that encloses every anomaly.

        Args:
            anomalies: non-empty iterable of mappings with numeric 'start'
                and 'finish' values; they are converted as epoch
                milliseconds.

        Returns:
            ``(min_time, max_time)``: the smallest 'start' and the largest
            'finish', each converted with ``pd.to_datetime(..., unit='ms')``.

        Raises:
            ValueError: if *anomalies* is empty.
        """
        # Materialize once so a one-shot iterable can be scanned twice.
        anomalies = list(anomalies)
        if not anomalies:
            raise ValueError("anomalies_box() requires at least one anomaly")

        # Take the true extremes rather than seeding the maximum with 0,
        # which reported a wrong upper bound for negative (pre-epoch) values.
        min_time = min(anomaly['start'] for anomaly in anomalies)
        max_time = max(anomaly['finish'] for anomaly in anomalies)
        return pd.to_datetime(min_time, unit='ms'), pd.to_datetime(max_time, unit='ms')

    def learn(self, anomalies):
        """Fit a fresh model and save it to the models folder.

        When *anomalies* is non-empty, training rows are restricted to the
        window returned by ``anomalies_box`` (bounds inclusive) and the value
        passed as the second argument of ``model.fit`` is 0.0; otherwise the
        whole dataframe is used and that value is 0.02.

        Args:
            anomalies: sized collection of labeled anomalies (see
                ``anomalies_box`` for the expected keys); may be empty.

        Returns:
            Epoch milliseconds of the last training row's 'timestamp' when
            anomalies were given, otherwise 0.

        Raises:
            IndexError: if the (possibly filtered) dataframe has no rows.
        """
        logger.info("Start to learn for anomaly_name='%s'", self.anomaly_name)

        has_labels = len(anomalies) > 0
        # NOTE(review): presumably a contamination/confidence level for the
        # algorithm; its exact meaning is defined by model.fit - confirm.
        confidence = 0.0 if has_labels else 0.02

        dataframe = self.data_prov.get_dataframe()
        if has_labels:
            min_time, max_time = self.anomalies_box(anomalies)
            timestamps = dataframe['timestamp']
            dataframe = dataframe[(timestamps <= max_time) & (timestamps >= min_time)]

        # NOTE(review): predict() treats the second argument of
        # get_augmented_data as an exclusive stop; passing index[-1] here may
        # leave out the last row - confirm against data_preprocessor.
        train_augmented = self.preprocessor.get_augmented_data(
            dataframe.index[0],
            dataframe.index[-1],
            anomalies
        )

        self.model = self.create_algorithm()
        self.model.fit(train_augmented, confidence)
        if has_labels:
            last_dataframe_time = dataframe.iloc[-1]['timestamp']
            last_prediction_time = int(last_dataframe_time.timestamp() * 1000)
        else:
            last_prediction_time = 0

        self.__save_model()
        logger.info("Learning is finished for anomaly_name='%s'", self.anomaly_name)
        return last_prediction_time

    def predict(self, last_prediction_time):
        """Run the model over rows newer than *last_prediction_time*.

        Rows from ``data_prov.get_upper_bound(last_prediction_time)`` up to
        ``data_prov.size()`` are processed in chunks of at most
        ``PREDICT_CHUNK_SIZE`` rows.

        Args:
            last_prediction_time: epoch milliseconds of the previous
                prediction boundary.

        Returns:
            ``(predicted_anomalies, last_prediction_time)``. The first item
            is the output of ``preprocessor.inverse_transform_anomalies``
            with 'start'/'finish' converted to epoch milliseconds, or an
            empty list when there are no new rows. The second item is the
            epoch-millisecond timestamp of the last row of the data, or the
            input boundary when there are no new rows.

        Raises:
            AttributeError: if there are new rows but no model has been
                trained or loaded (``self.model`` is None).
        """
        logger.info("Start to predict for anomaly type='%s'", self.anomaly_name)
        last_prediction_time = pd.to_datetime(last_prediction_time, unit='ms')

        start_index = self.data_prov.get_upper_bound(last_prediction_time)
        stop_index = self.data_prov.size()

        last_prediction_time = int(last_prediction_time.timestamp() * 1000)

        predicted_anomalies = []
        if start_index < stop_index:
            chunk_size = self.PREDICT_CHUNK_SIZE
            # Collect per-chunk results and concatenate once: this avoids
            # seeding with a dtype-less empty Series and re-copying the
            # accumulated result on every iteration.
            predicted_chunks = []
            for chunk_start in range(start_index, stop_index, chunk_size):
                chunk_finish = min(chunk_start + chunk_size, stop_index)
                predict_augmented = self.preprocessor.get_augmented_data(chunk_start, chunk_finish)

                # Internal invariant: one augmented row per source row.
                assert len(predict_augmented) == chunk_finish - chunk_start, \
                    "augmented chunk size does not match the requested range"

                predicted_chunks.append(self.model.predict(predict_augmented))
            # start_index < stop_index guarantees at least one chunk.
            predicted = pd.concat(predicted_chunks)
            predicted_anomalies = self.preprocessor.inverse_transform_anomalies(predicted)

            last_row = self.data_prov.get_data_range(stop_index - 1, stop_index)

            last_dataframe_time = last_row.iloc[0]['timestamp']
            predicted_anomalies = anomalies_to_timestamp(predicted_anomalies)
            last_prediction_time = int(last_dataframe_time.timestamp() * 1000)

        logger.info("Predicting is finished for anomaly type='%s'", self.anomaly_name)
        return predicted_anomalies, last_prediction_time

    def synchronize_data(self):
        """Synchronize the data provider, then the preprocessor built on it."""
        self.data_prov.synchronize()
        self.preprocessor.set_data_provider(self.data_prov)
        self.preprocessor.synchronize()

    def load_anomaly_config(self):
        """Read ``<anomaly_name>.json`` from the anomalies folder into ``self.anomaly_config``.

        Raises:
            OSError: if the config file cannot be opened.
        """
        config_filename = os.path.join(config.ANOMALIES_FOLDER, self.anomaly_name + ".json")
        with open(config_filename, 'r', encoding='utf-8') as config_file:
            self.anomaly_config = json.load(config_file)

    def get_anomalies(self):
        """Return the parsed ``<anomaly_name>_labeled.json`` content.

        Returns:
            The decoded JSON document, or an empty list when the file does
            not exist.
        """
        labeled_anomalies_file = os.path.join(config.ANOMALIES_FOLDER, self.anomaly_name + "_labeled.json")
        if not os.path.exists(labeled_anomalies_file):
            return []
        with open(labeled_anomalies_file, encoding='utf-8') as file:
            return json.load(file)

    def create_algorithm(self):
        """Return a new, unfitted ``supervised_algorithm`` instance."""
        # Imported here rather than at module level, as in the original code.
        from supervised_algorithm import supervised_algorithm
        return supervised_algorithm()

    def _model_filename(self):
        """Return the path of this anomaly's model file in the models folder."""
        return os.path.join(config.MODELS_FOLDER, self.anomaly_name + ".m")

    def __save_model(self):
        """Save ``self.model`` to this anomaly's model file."""
        logger.info("Save model '%s'", self.anomaly_name)
        self.model.save(self._model_filename())

    def __load_model(self):
        """Load the saved model if its file exists; otherwise leave ``self.model`` unchanged."""
        logger.info("Load model '%s'", self.anomaly_name)
        model_filename = self._model_filename()
        if os.path.exists(model_filename):
            self.model = self.create_algorithm()
            self.model.load(model_filename)
Add src 7 years ago			`import os.path`
			`from data_provider import DataProvider`
			`from data_preprocessor import data_preprocessor`
			`import json`
			`import pandas as pd`
			`import logging`
132-proxy-db-queries-through-grafana 7 years ago			`from urllib.parse import urlparse`
folders config++ 7 years ago			`import config`

Add src 7 years ago
			`logger = logging.getLogger('analytic_toolset')`


			`def anomalies_to_timestamp(anomalies):`
			`for anomaly in anomalies:`
			`anomaly['start'] = int(anomaly['start'].timestamp() * 1000)`
			`anomaly['finish'] = int(anomaly['finish'].timestamp() * 1000)`
			`return anomalies`


			`class AnomalyModel:`

			`def __init__(self, anomaly_name):`
			`self.anomaly_name = anomaly_name`
			`self.load_anomaly_config()`

132-proxy-db-queries-through-grafana 7 years ago			`parsedUrl = urlparse(self.anomaly_config['panelUrl'])`
			`origin = parsedUrl.scheme + '://' + parsedUrl.netloc`

			`datasource = self.anomaly_config['datasource']`
			`datasource['origin'] = origin`
Add src 7 years ago			`metric_name = self.anomaly_config['metric']['targets'][0]`

folders config++ 7 years ago			`target_filename = os.path.join(config.METRICS_FOLDER, metric_name + ".json")`
Add src 7 years ago
folders config++ 7 years ago			`dataset_filename = os.path.join(config.DATASET_FOLDER, metric_name + ".csv")`
			`augmented_path = os.path.join(config.DATASET_FOLDER, metric_name + "_augmented.csv")`
Add src 7 years ago
			`with open(target_filename, 'r') as file:`
			`target = json.load(file)`

132-proxy-db-queries-through-grafana 7 years ago			`self.data_prov = DataProvider(datasource, target, dataset_filename)`
Add src 7 years ago			`self.preprocessor = data_preprocessor(self.data_prov, augmented_path)`
			`self.model = None`

			`self.__load_model()`

			`def anomalies_box(self, anomalies):`
			`max_time = 0`
			`min_time = float("inf")`
			`for anomaly in anomalies:`
			`max_time = max(max_time, anomaly['finish'])`
			`min_time = min(min_time, anomaly['start'])`
			`min_time = pd.to_datetime(min_time, unit='ms')`
			`max_time = pd.to_datetime(max_time, unit='ms')`
			`return min_time, max_time`

			`def learn(self, anomalies):`
			`logger.info("Start to learn for anomaly_name='%s'" % self.anomaly_name)`

			`confidence = 0.02`
			`dataframe = self.data_prov.get_dataframe()`
			`start_index, stop_index = 0, len(dataframe)`
			`if len(anomalies) > 0:`
			`confidence = 0.0`
			`min_time, max_time = self.anomalies_box(anomalies)`
151 fix error on second learning (#5) 7 years ago			`dataframe = dataframe[dataframe['timestamp'] <= max_time]`
			`dataframe = dataframe[dataframe['timestamp'] >= min_time]`
Add src 7 years ago
			`train_augmented = self.preprocessor.get_augmented_data(`
151 fix error on second learning (#5) 7 years ago			`dataframe.index[0],`
			`dataframe.index[-1],`
Add src 7 years ago			`anomalies`
			`)`

			`self.model = self.create_algorithm()`
			`self.model.fit(train_augmented, confidence)`
			`if len(anomalies) > 0:`
minor fixes 6 years ago			`last_dataframe_time = dataframe.iloc[-1]['timestamp']`
Add src 7 years ago			`last_prediction_time = int(last_dataframe_time.timestamp() * 1000)`
			`else:`
			`last_prediction_time = 0`

			`self.__save_model()`
			`logger.info("Learning is finished for anomaly_name='%s'" % self.anomaly_name)`
			`return last_prediction_time`

			`def predict(self, last_prediction_time):`
			`logger.info("Start to predict for anomaly type='%s'" % self.anomaly_name)`
			`last_prediction_time = pd.to_datetime(last_prediction_time, unit='ms')`

			`start_index = self.data_prov.get_upper_bound(last_prediction_time)`
			`stop_index = self.data_prov.size()`

			`last_prediction_time = int(last_prediction_time.timestamp() * 1000)`

			`predicted_anomalies = []`
			`if start_index < stop_index:`
			`max_chunk_size = 50000`
			`predicted = pd.Series()`
			`for index in range(start_index, stop_index, max_chunk_size):`
			`chunk_start = index`
			`chunk_finish = min(index + max_chunk_size, stop_index)`
			`predict_augmented = self.preprocessor.get_augmented_data(chunk_start, chunk_finish)`

			`assert(len(predict_augmented) == chunk_finish - chunk_start)`

			`predicted_current = self.model.predict(predict_augmented)`
			`predicted = pd.concat([predicted, predicted_current])`
			`predicted_anomalies = self.preprocessor.inverse_transform_anomalies(predicted)`

			`last_row = self.data_prov.get_data_range(stop_index - 1, stop_index)`

			`last_dataframe_time = last_row.iloc[0]['timestamp']`
			`predicted_anomalies = anomalies_to_timestamp(predicted_anomalies)`
			`last_prediction_time = int(last_dataframe_time.timestamp() * 1000)`

			`logger.info("Predicting is finished for anomaly type='%s'" % self.anomaly_name)`
			`return predicted_anomalies, last_prediction_time`

			`def synchronize_data(self):`
			`self.data_prov.synchronize()`
			`self.preprocessor.set_data_provider(self.data_prov)`
			`self.preprocessor.synchronize()`

			`def load_anomaly_config(self):`
folders config++ 7 years ago			`with open(os.path.join(config.ANOMALIES_FOLDER, self.anomaly_name + ".json"), 'r') as config_file:`
Add src 7 years ago			`self.anomaly_config = json.load(config_file)`

			`def get_anomalies(self):`
folders config++ 7 years ago			`labeled_anomalies_file = os.path.join(config.ANOMALIES_FOLDER, self.anomaly_name + "_labeled.json")`
Add src 7 years ago			`if not os.path.exists(labeled_anomalies_file):`
			`return []`
			`with open(labeled_anomalies_file) as file:`
			`return json.load(file)`

			`def create_algorithm(self):`
			`from supervised_algorithm import supervised_algorithm`
			`return supervised_algorithm()`

			`def __save_model(self):`
			`logger.info("Save model '%s'" % self.anomaly_name)`
folders config++ 7 years ago			`model_filename = os.path.join(config.MODELS_FOLDER, self.anomaly_name + ".m")`
Add src 7 years ago			`self.model.save(model_filename)`

			`def __load_model(self):`
			`logger.info("Load model '%s'" % self.anomaly_name)`
folders config++ 7 years ago			`model_filename = os.path.join(config.MODELS_FOLDER, self.anomaly_name + ".m")`
Add src 7 years ago			`if os.path.exists(model_filename):`
			`self.model = self.create_algorithm()`
			`self.model.load(model_filename)`