Add src

rozetko committed af4ab07edb into pull/1/head, 6 years ago
  1. .gitignore (+3)
  2. README.md (+29)
  3. REST.md (+198)
  4. server/README.md (+27)
  5. server/build/dev-server.js (+10)
  6. server/build/webpack.base.conf.js (+52)
  7. server/build/webpack.dev.conf.js (+4)
  8. server/build/webpack.prod.conf.js (+3)
  9. server/package-lock.json (+5093)
  10. server/package.json (+32)
  11. server/src/config.ts (+9)
  12. server/src/index.ts (+31)
  13. server/src/routes/alerts.ts (+62)
  14. server/src/routes/anomalies.ts (+136)
  15. server/src/routes/segments.ts (+80)
  16. server/src/services/alerts.ts (+58)
  17. server/src/services/analytics.ts (+141)
  18. server/src/services/anomalyType.ts (+117)
  19. server/src/services/json.ts (+55)
  20. server/src/services/metrics.ts (+27)
  21. server/src/services/notification.ts (+140)
  22. server/src/services/segments.ts (+75)
  23. server/tsconfig.json (+10)
  24. src/.gitignore (+11)
  25. src/add_anomaly.py (+5)
  26. src/anomaly_model.py (+157)
  27. src/data_preprocessor.py (+255)
  28. src/data_provider.py (+220)
  29. src/learn.py (+52)
  30. src/pattern_detection_model.py (+127)
  31. src/peaks_detector.py (+71)
  32. src/predict.py (+83)
  33. src/prophet_algorithm.py (+46)
  34. src/step_detect.py (+231)
  35. src/step_detector.py (+188)
  36. src/supervised_algorithm.py (+71)
  37. src/worker.py (+131)

.gitignore (vendored, +3)

@@ -0,0 +1,3 @@
node_modules/
dist/
.vscode/

README.md (+29)

@@ -0,0 +1,29 @@
# Hastic server
Implementation of basic pattern recognition and unsupervised learning for anomaly detection.
Implementation of the analytic unit for Hastic.
See the [REST API](REST.md).
## Build & run
### Analytic unit
Python3 project
```
pip3 install pandas
pip3 install influxdb
```
### Server
Node.js project
```
cd server
npm install
npm run build
npm start
```

REST.md (+198)

@@ -0,0 +1,198 @@
# Hastic server REST API
## /anomalies
### Get anomalies
`GET /anomalies?id=<anomaly_id>[&name=<anomaly_name>]`
NOTE: `name` param is deprecated, use `id` instead
Return data format:
```
{
"name": "<anomaly_name>",
"metric": "<metric_id>",
"status": "<str>"
}
```
The `status` field can be one of:
- `learning`
- `ready`
- `failed`
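For illustration, fetching an anomaly from Node.js (18+) could look like the following TypeScript sketch; the base URL assumes the server's default port 8000:
```
// Fetch one anomaly by id and inspect its status.
async function getAnomaly(anomalyId: string) {
  const resp = await fetch(`http://localhost:8000/anomalies?id=${anomalyId}`);
  if (!resp.ok) {
    throw new Error(`GET /anomalies failed with ${resp.status}`);
  }
  const anomaly = await resp.json();
  console.log(anomaly.status); // 'learning' | 'ready' | 'failed'
  return anomaly;              // { name, metric, status }
}
```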
### Get anomaly status
`GET /anomalies/status?id=<anomaly_id>[&name=<anomaly_name>]`
NOTE: `name` param is deprecated, use `id` instead
Return data format:
```
{
"status": <str>
}
```
The `status` field can be one of:
- `learning`
- `ready`
- `failed`
### Add anomaly
`POST /anomalies`
Data format:
```
{
"name": "cpu_utilization_supervised",
"metric": {
"datasource": "influx accelerometer",
"targets": [
<targets>
]
},
"panelUrl": "http://grafana.example.com/d/oNZ35bWiz/new-dashboard-copy?panelId=2&fullscreen"
}
```
`targets` example:
```
{
"alias": "command",
"groupBy": [],
"measurement": "data",
"orderByTime": "ASC",
"policy": "default",
"refId": "A",
"resultFormat": "time_series",
"select": [
[
{
"params": [
"command"
],
"type": "field"
}
]
],
"tags": []
}
```
Return data format:
```
{
"anomaly_id": "<anomaly_id>"
}
```
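A creation request can be sketched in TypeScript as follows; the datasource, target and panel URL are taken from the examples above and are placeholders for real values:
```
// Create an anomaly from a single InfluxDB target and return the new id.
async function createAnomaly(): Promise<string> {
  const target = {
    alias: 'command',
    groupBy: [],
    measurement: 'data',
    orderByTime: 'ASC',
    policy: 'default',
    refId: 'A',
    resultFormat: 'time_series',
    select: [[{ params: ['command'], type: 'field' }]],
    tags: []
  };
  const resp = await fetch('http://localhost:8000/anomalies', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      name: 'cpu_utilization_supervised',
      metric: { datasource: 'influx accelerometer', targets: [target] },
      panelUrl: 'http://grafana.example.com/d/oNZ35bWiz/new-dashboard-copy?panelId=2&fullscreen'
    })
  });
  const { anomaly_id } = await resp.json();
  return anomaly_id; // learning starts on the server right after creation
}
```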
### Delete anomalies
`DELETE /anomalies`
Data format:
```
{
"id": "<anomaly_id>",
"name": "<anomaly_name>" // deprecated, use id instead
}
```
Return data format:
```
Success
```
## /segments
### Get segments
`GET /segments?anomaly_id=<anomaly_id>[&last_segment=<id>][&from=<time_from>][&to=<time_to>]`
Return data format:
```
{
"segments": [
{
"id": 0,
"start": 1392765184318,
"finish": 1397243699000,
"labeled": true
},
...
]
}
```
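The filters can be combined; a TypeScript sketch with a placeholder id and timestamps, assuming the default port 8000:
```
// List segments of an anomaly, newer than a known segment id and inside a time window.
async function getSegments(anomalyId: string) {
  const params = new URLSearchParams({
    anomaly_id: anomalyId,
    last_segment: '12',     // only segments with id > 12
    from: '1392765184318',  // only segments that finish after this time
    to: '1397243699000'     // only segments that start before this time
  });
  const resp = await fetch(`http://localhost:8000/segments?${params}`);
  const { segments } = await resp.json();
  return segments;
}
```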
### Update segments
`PATCH /segments`
Data format:
```
{
"anomaly_id": "<anomaly_id>",
"name": "<anomaly_name>", // deprecated, use id instead
"added_segments": [
{
"start": 1397164656000,
"finish": 1397243699000
},
...
],
"removed_segments": [3, 9]
}
```
Return data format:
```
{
"added_ids": [12, ...]
}
```
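An update that labels one new segment and removes two existing ones could be sketched like this (values are placeholders):
```
// Label a new segment and delete segments 3 and 9 in one request.
async function updateSegments(anomalyId: string) {
  const resp = await fetch('http://localhost:8000/segments', {
    method: 'PATCH',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      anomaly_id: anomalyId,
      added_segments: [{ start: 1397164656000, finish: 1397243699000 }],
      removed_segments: [3, 9]
    })
  });
  const { added_ids } = await resp.json();
  return added_ids; // ids assigned to the newly added segments
}
```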
## /alerts
### Check if alert is enabled for an anomaly
`GET /alerts?anomaly_id=<anomaly_id>`
Return data format:
```
{
"enable": true
}
```
### Enable / disable alert for an anomaly
`POST /alerts`
Data format:
```
{
"anomaly_id": "<anomaly_id>",
"enable": true
}
```
Return data format:
```
{
"status": "Ok"
}
```
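Both alert endpoints together, as a TypeScript sketch (default port 8000 assumed):
```
// Turn alerting on for an anomaly, then read the flag back.
async function enableAlert(anomalyId: string): Promise<boolean> {
  await fetch('http://localhost:8000/alerts', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ anomaly_id: anomalyId, enable: true })
  });
  const resp = await fetch(`http://localhost:8000/alerts?anomaly_id=${anomalyId}`);
  const { enable } = await resp.json();
  return enable; // true if alerting is now active
}
```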

server/README.md (+27)

@@ -0,0 +1,27 @@
# Hastic server
REST server for managing data for analytics.
Runs on port 8000.
# Build
```
npm install
npm run build
```
# Run
```
npm start
```
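Once started, the server listens on port 8000 and answers on the root route. A minimal reachability check in TypeScript (assuming Node 18+ for the global `fetch`):
```
// Smoke test: the root route replies with a short text message.
fetch('http://localhost:8000/')
  .then(resp => resp.text())
  .then(text => console.log(text)) // expected: 'Analytic unit works'
  .catch(err => console.error('server is not reachable', err));
```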
# Development
You should have the `nodemon` module installed to run the development server.
```
npm i -g nodemon
npm run dev
```

server/build/dev-server.js (+10)

@@ -0,0 +1,10 @@
const { spawn } = require('child_process');
const webpack = spawn('webpack', ['--config', 'build/webpack.dev.conf.js'], {
stdio: 'inherit',
shell: true
});
//webpack.stdout.pipe(process.stdout);
const nodemon = spawn('nodemon', ['../dist/server', '--watch', 'server.js']);
nodemon.stdout.pipe(process.stdout);

server/build/webpack.base.conf.js (+52)

@@ -0,0 +1,52 @@
const path = require('path');
const fs = require('fs');
const webpack = require('webpack');
function resolve(p) {
return path.join(__dirname, '/../', p);
}
module.exports = {
target: 'node',
node: {
__dirname: false,
__filename: false,
},
context: resolve('./src'),
entry: './index',
devtool: 'inline-source-map',
output: {
filename: "server.js",
path: resolve('dist')
},
externals: [
function(context, request, callback) {
if(request[0] == '.') {
callback();
} else {
callback(null, "require('" + request + "')");
}
}
],
plugins: [
new webpack.optimize.OccurrenceOrderPlugin(),
new webpack.HotModuleReplacementPlugin(),
new webpack.DefinePlugin({
'process.env.NODE_ENV': JSON.stringify('development')
})
],
resolve: {
extensions: [".ts", ".js"]
},
module: {
rules: [
{
test: /\.ts$/,
loader: "ts-loader",
exclude: /node_modules/
}
]
}
}

server/build/webpack.dev.conf.js (+4)

@@ -0,0 +1,4 @@
var base = require('./webpack.base.conf');
base.watch = true;
module.exports = base;

server/build/webpack.prod.conf.js (+3)

@@ -0,0 +1,3 @@
var base = require('./webpack.base.conf');
module.exports = base;

server/package-lock.json (generated, +5093)

File diff suppressed because it is too large

server/package.json (+32)

@@ -0,0 +1,32 @@
{
"name": "hastic-server",
"version": "1.0.0",
"description": "REST server for managing data for analytics",
"scripts": {
"start": "node dist/server.js",
"dev": "node build/dev-server.js",
"build": "webpack --config build/webpack.prod.conf.js"
},
"repository": {
"type": "git",
"url": "git+https://github.com/hastic/hastic-server.git"
},
"author": "CorpGlory",
"license": "ISC",
"bugs": {
"url": "https://github.com/hastic/hastic-server/issues"
},
"homepage": "https://github.com/hastic/hastic-server#readme",
"dependencies": {
"express": "^4.16.3",
"fast-csv": "^2.4.1",
"telegraf": "^3.21.0"
},
"devDependencies": {
"@types/express": "^4.11.1",
"nodemon": "^1.17.3",
"ts-loader": "^3.5.0",
"typescript": "^2.8.3",
"webpack": "^3.5.6"
}
}

server/src/config.ts (+9)

@@ -0,0 +1,9 @@
import * as path from 'path';
const DATA_PATH = path.join(__dirname, '../data');
const ANALYTICS_PATH = path.join(__dirname, '../../src');
const ANOMALIES_PATH = path.join(ANALYTICS_PATH, 'anomalies');
const SEGMENTS_PATH = path.join(ANALYTICS_PATH, 'segments');
const METRICS_PATH = path.join(ANALYTICS_PATH, 'metrics');
export { DATA_PATH, ANALYTICS_PATH, ANOMALIES_PATH, SEGMENTS_PATH, METRICS_PATH }

server/src/index.ts (+31)

@@ -0,0 +1,31 @@
import * as express from 'express';
import * as bodyParser from 'body-parser';
import { router as anomaliesRouter } from './routes/anomalies';
import { router as segmentsRouter } from './routes/segments';
import { router as alertsRouter } from './routes/alerts';
import { tgBotInit } from './services/notification';
const app = express();
const PORT = 8000;
app.use(bodyParser.json());
app.use(bodyParser.urlencoded({ extended: true }));
app.use(function (req, res, next) {
res.header('Access-Control-Allow-Origin', '*');
res.header('Access-Control-Allow-Methods', 'GET, POST, PUT, DELETE, PATCH, OPTIONS');
res.header('Access-Control-Allow-Headers', 'Origin, X-Requested-With, Content-Type, Accept');
next();
});
app.use('/anomalies', anomaliesRouter);
app.use('/segments', segmentsRouter);
app.use('/alerts', alertsRouter);
app.use('/', (req, res) => { res.send('Analytic unit works') });
app.listen(PORT, () => {
console.log(`Server is running on :${PORT}`)
});
tgBotInit();

server/src/routes/alerts.ts (+62)

@@ -0,0 +1,62 @@
import * as express from 'express';
import {AnomalyId, getAnomalyIdByName, loadAnomalyById} from '../services/anomalyType';
import { getAlertsAnomalies, saveAlertsAnomalies } from '../services/alerts';
function getAlert(req, res) {
try {
let anomalyId: AnomalyId = req.query.anomaly_id;
let anomaly = loadAnomalyById(anomalyId)
if (anomaly == null) {
anomalyId = getAnomalyIdByName(anomalyId.toLowerCase());
}
let alertsAnomalies = getAlertsAnomalies();
let pos = alertsAnomalies.indexOf(anomalyId);
let enable: boolean = (pos !== -1);
res.status(200).send({
enable
});
} catch(e) {
res.status(500).send({
code: 500,
message: 'Internal error'
});
}
}
function changeAlert(req, res) {
try {
let anomalyId: AnomalyId = req.body.anomaly_id;
let enable: boolean = req.body.enable;
let anomaly = loadAnomalyById(anomalyId)
if (anomaly == null) {
anomalyId = getAnomalyIdByName(anomalyId.toLowerCase());
}
let alertsAnomalies = getAlertsAnomalies();
let pos: number = alertsAnomalies.indexOf(anomalyId);
if(enable && pos == -1) {
alertsAnomalies.push(anomalyId);
saveAlertsAnomalies(alertsAnomalies);
} else if(!enable && pos > -1) {
alertsAnomalies.splice(pos, 1);
saveAlertsAnomalies(alertsAnomalies);
}
res.status(200).send({
status: 'Ok'
});
} catch(e) {
res.status(500).send({
code: 500,
message: 'Internal error'
});
}
}
export const router = express.Router();
router.get('/', getAlert);
router.post('/', changeAlert);

server/src/routes/anomalies.ts (+136)

@@ -0,0 +1,136 @@
import * as express from 'express';
import {
Metric,
Anomaly,
saveAnomaly,
insertAnomaly, removeAnomaly, loadAnomalyByName, loadAnomalyById, getAnomalyIdByName
} from '../services/anomalyType';
import { runLearning } from '../services/analytics'
import { saveTargets } from '../services/metrics';
async function sendAnomalyTypeStatus(req, res) {
let id = req.query.id;
let name = req.query.name;
try {
let anomaly: Anomaly;
if(id !== undefined) {
anomaly = loadAnomalyById(id);
} else {
anomaly = loadAnomalyByName(name);
}
if(anomaly === null) {
res.status(404).send({
code: 404,
message: 'Not found'
});
return;
}
if(anomaly.status === undefined) {
throw new Error('No status for ' + name);
}
res.status(200).send({ status: anomaly.status });
} catch(e) {
console.error(e);
// TODO: better to send 404 when we know it isn't found
res.status(500).send({ error: 'Can`t return anything' });
}
}
async function getAnomaly(req, res) {
try {
let id = req.query.id;
let name = req.query.name;
let anomaly:Anomaly;
if(id !== undefined) {
anomaly = loadAnomalyById(id);
} else {
anomaly = loadAnomalyByName(name.toLowerCase());
}
if(anomaly === null) {
res.status(404).send({
code: 404,
message: 'Not found'
});
return;
}
let payload = JSON.stringify({
name: anomaly.name,
metric: anomaly.metric,
status: anomaly.status
});
res.status(200).send(payload)
} catch(e) {
console.error(e);
// TODO: better to send 404 when we know it isn't found
res.status(500).send('Can`t get anything');
}
}
async function createAnomaly(req, res) {
try {
const metric:Metric = {
datasource: req.body.metric.datasource,
targets: saveTargets(req.body.metric.targets)
};
const anomaly:Anomaly = {
name: req.body.name,
panelUrl: req.body.panelUrl,
metric: metric,
status: 'learning',
last_prediction_time: 0,
next_id: 0
};
let anomalyId = insertAnomaly(anomaly);
if(anomalyId === null) {
res.status(403).send({
code: 403,
message: 'Already exists'
});
return;
}
let payload = JSON.stringify({ anomaly_id: anomalyId })
res.status(200).send(payload);
runLearning(anomalyId);
} catch(e) {
res.status(500).send({
code: 500,
message: 'Internal error'
});
}
}
function deleteAnomaly(req, res) {
try {
let id = req.query.id;
let name = req.query.name;
if(id !== undefined) {
removeAnomaly(id);
} else {
removeAnomaly(name.toLowerCase());
}
res.status(200).send({
code: 200,
message: 'Success'
});
} catch(e) {
res.status(500).send({
code: 500,
message: 'Internal error'
});
}
}
export const router = express.Router();
router.get('/status', sendAnomalyTypeStatus);
router.get('/', getAnomaly);
router.post('/', createAnomaly);
router.delete('/', deleteAnomaly);

server/src/routes/segments.ts (+80)

@@ -0,0 +1,80 @@
import * as express from 'express';
import {
getLabeledSegments,
insertSegments,
removeSegments,
} from '../services/segments';
import {runLearning} from '../services/analytics';
import {Anomaly, AnomalyId, getAnomalyIdByName, loadAnomalyById} from '../services/anomalyType';
async function sendSegments(req, res) {
try {
let anomalyId: AnomalyId = req.query.anomaly_id;
let anomaly:Anomaly = loadAnomalyById(anomalyId);
if(anomaly === null) {
anomalyId = getAnomalyIdByName(anomalyId);
}
let lastSegmentId = req.query.last_segment;
let timeFrom = req.query.from;
let timeTo = req.query.to;
let segments = getLabeledSegments(anomalyId);
// Id filtering
if(lastSegmentId !== undefined) {
segments = segments.filter(el => el.id > lastSegmentId);
}
// Time filtering
if(timeFrom !== undefined) {
segments = segments.filter(el => el.finish > timeFrom);
}
if(timeTo !== undefined) {
segments = segments.filter(el => el.start < timeTo);
}
let payload = JSON.stringify({
segments
});
res.status(200).send(payload);
} catch(e) {
res.status(500).send({
code: 500,
message: 'Internal error'
});
}
}
async function updateSegments(req, res) {
try {
let segmentsUpdate = req.body;
let anomalyId = segmentsUpdate.anomaly_id;
let anomalyName = segmentsUpdate.name;
if(anomalyId === undefined) {
anomalyId = getAnomalyIdByName(anomalyName.toLowerCase());
}
let addedIds = insertSegments(anomalyId, segmentsUpdate.added_segments, true);
removeSegments(anomalyId, segmentsUpdate.removed_segments);
let payload = JSON.stringify({ added_ids: addedIds });
res.status(200).send(payload);
runLearning(anomalyId);
} catch(e) {
res.status(500).send({
code: 500,
message: 'Internal error'
});
}
}
export const router = express.Router();
router.get('/', sendSegments);
router.patch('/', updateSegments);

server/src/services/alerts.ts (+58)

@@ -0,0 +1,58 @@
import { getJsonDataSync, writeJsonDataSync } from './json';
import * as path from 'path';
import { AnomalyId } from './anomalyType';
import { ANOMALIES_PATH } from '../config';
import { runPredict } from './analytics';
import { sendNotification } from './notification';
import { getLabeledSegments } from './segments';
function getAlertsAnomalies() : AnomalyId[] {
return getJsonDataSync(path.join(ANOMALIES_PATH, `alerts_anomalies.json`));
}
function saveAlertsAnomalies(anomalies: AnomalyId[]) {
return writeJsonDataSync(path.join(ANOMALIES_PATH, `alerts_anomalies.json`), anomalies);
}
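// An alert for an anomaly is considered active while its latest segment
// finishes within `alertTimeout` ms of the current time; sendNotification()
// is called only on transitions between inactive and active.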
function processAlerts(anomalyId) {
let segments = getLabeledSegments(anomalyId);
const currentTime = new Date().getTime();
const activeAlert = activeAlerts.has(anomalyId);
let newActiveAlert = false;
if(segments.length > 0) {
let lastSegment = segments[segments.length - 1];
if(lastSegment.finish >= currentTime - alertTimeout) {
newActiveAlert = true;
}
}
if(!activeAlert && newActiveAlert) {
activeAlerts.add(anomalyId);
sendNotification(anomalyId, true);
} else if(activeAlert && !newActiveAlert) {
activeAlerts.delete(anomalyId);
sendNotification(anomalyId, false);
}
}
async function alertsTick() {
let alertsAnomalies = getAlertsAnomalies();
for (let anomalyId of alertsAnomalies) {
try {
await runPredict(anomalyId);
processAlerts(anomalyId);
} catch (e) {
console.error(e);
}
}
setTimeout(alertsTick, 5000);
}
const alertTimeout = 60000; // ms
const activeAlerts = new Set<string>();
setTimeout(alertsTick, 5000);
export { getAlertsAnomalies, saveAlertsAnomalies }

server/src/services/analytics.ts (+141)

@@ -0,0 +1,141 @@
import { spawn } from 'child_process'
import { ANALYTICS_PATH } from '../config'
import {
Anomaly,
AnomalyId, getAnomalyTypeInfo,
loadAnomalyById,
setAnomalyPredictionTime,
setAnomalyStatus
} from './anomalyType'
import { getTarget } from './metrics';
import { getLabeledSegments, insertSegments, removeSegments } from './segments';
import { split, map, mapSync } from 'event-stream'
const learnWorker = spawn('python3', ['worker.py'], { cwd: ANALYTICS_PATH })
learnWorker.stdout.pipe(split())
.pipe(
mapSync(function(line){
console.log(line)
onMessage(line)
})
);
learnWorker.stderr.on('data', data => console.error(`worker stderr: ${data}`));
const taskMap = {};
let nextTaskId = 0;
function onMessage(data) {
let response = JSON.parse(data);
let taskId = response.__task_id;
// let anomalyName = response.anomaly_name;
// let task = response.task;
let status = response.status;
if(status === 'success' || status === 'failed') {
if(taskId in taskMap) {
let resolver = taskMap[taskId];
resolver(response);
delete taskMap[taskId];
}
}
}
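// Each task is written to the worker's stdin as a single JSON line; the
// response is matched back to its caller via `__task_id` in onMessage() above.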
function runTask(task) : Promise<any> {
let anomaly:Anomaly = loadAnomalyById(task.anomaly_id);
task.metric = {
datasource: anomaly.metric.datasource,
targets: anomaly.metric.targets.map(t => getTarget(t))
};
task.__task_id = nextTaskId++;
let command = JSON.stringify(task)
learnWorker.stdin.write(`${command}\n`);
return new Promise<Object>((resolve, reject) => {
taskMap[task.__task_id] = resolve
})
}
async function runLearning(anomalyId:AnomalyId) {
let segments = getLabeledSegments(anomalyId);
setAnomalyStatus(anomalyId, 'learning');
let anomaly:Anomaly = loadAnomalyById(anomalyId);
let analyticsType = "anomalies";
let preset = undefined;
if (anomaly.name.includes("jumps")) {
analyticsType = "patterns";
preset = "steps"
}
if (anomaly.name.includes("cliffs") || anomaly.name.includes("drops")) {
analyticsType = "patterns";
preset = "cliffs"
}
if (anomaly.name.includes("peaks")) {
analyticsType = "patterns";
preset = "peaks"
}
let task = {
type: 'learn',
anomaly_id: anomalyId,
analytics_type: analyticsType,
preset,
segments: segments
};
let result = await runTask(task);
if (result.status === 'success') {
setAnomalyStatus(anomalyId, 'ready');
insertSegments(anomalyId, result.segments, false);
setAnomalyPredictionTime(anomalyId, result.last_prediction_time);
} else {
setAnomalyStatus(anomalyId, 'failed');
}
}
async function runPredict(anomalyId:AnomalyId) {
let anomaly:Anomaly = loadAnomalyById(anomalyId);
let analyticsType = "anomalies";
let preset = undefined;
if (anomaly.name.includes("jump")) {
analyticsType = "patterns";
preset = "steps"
}
if (anomaly.name.includes("cliffs") || anomaly.name.includes("drops")) {
analyticsType = "patterns";
preset = "cliffs"
}
if (anomaly.name.includes("peaks")) {
analyticsType = "patterns";
preset = "peaks"
}
let task = {
type: 'predict',
anomaly_id: anomalyId,
analytics_type: analyticsType,
preset,
last_prediction_time: anomaly.last_prediction_time
};
let result = await runTask(task);
if(result.status === 'failed') {
return [];
}
// Merging segments
let segments = getLabeledSegments(anomalyId);
if(segments.length > 0 && result.segments.length > 0) {
let lastOldSegment = segments[segments.length - 1];
let firstNewSegment = result.segments[0];
if(firstNewSegment.start <= lastOldSegment.finish) {
result.segments[0].start = lastOldSegment.start;
removeSegments(anomalyId, [lastOldSegment.id]);
}
}
insertSegments(anomalyId, result.segments, false);
setAnomalyPredictionTime(anomalyId, result.last_prediction_time);
return result.segments;
}
export { runLearning, runPredict }

server/src/services/anomalyType.ts (+117)

@@ -0,0 +1,117 @@
import * as path from 'path'
import { getJsonDataSync, writeJsonDataSync } from './json'
import { ANOMALIES_PATH } from '../config'
import * as fs from 'fs'
import * as crypto from 'crypto';
export type Metric = {
datasource: string,
targets: string[]
}
export type Anomaly = {
name: string,
panelUrl: string,
metric: Metric,
status: string,
last_prediction_time: number,
next_id: number
}
export type AnomalyId = string;
let anomaliesNameToIdMap = {};
function loadAnomaliesMap() {
let filename = path.join(ANOMALIES_PATH, `all_anomalies.json`);
anomaliesNameToIdMap = getJsonDataSync(filename);
}
function saveAnomaliesMap() {
let filename = path.join(ANOMALIES_PATH, `all_anomalies.json`);
writeJsonDataSync(filename, anomaliesNameToIdMap);
}
function getAnomalyIdByName(anomalyName:string) : AnomalyId {
loadAnomaliesMap();
anomalyName = anomalyName.toLowerCase();
if(anomalyName in anomaliesNameToIdMap) {
return anomaliesNameToIdMap[anomalyName];
}
return anomalyName;
}
function insertAnomaly(anomaly: Anomaly) : AnomalyId {
const hashString = anomaly.name + (new Date()).toString();
const anomalyId:AnomalyId = crypto.createHash('md5').update(hashString).digest('hex');
anomaliesNameToIdMap[anomaly.name] = anomalyId;
saveAnomaliesMap();
// return anomalyId
// const anomalyId:AnomalyId = anomaly.name;
let filename = path.join(ANOMALIES_PATH, `${anomalyId}.json`);
if(fs.existsSync(filename)) {
return null;
}
saveAnomaly(anomalyId, anomaly);
return anomalyId;
}
function removeAnomaly(anomalyId:AnomalyId) {
let filename = path.join(ANOMALIES_PATH, `${anomalyId}.json`);
fs.unlinkSync(filename);
}
function saveAnomaly(anomalyId: AnomalyId, anomaly: Anomaly) {
let filename = path.join(ANOMALIES_PATH, `${anomalyId}.json`);
return writeJsonDataSync(filename, anomaly);
}
function loadAnomalyById(anomalyId: AnomalyId) : Anomaly {
let filename = path.join(ANOMALIES_PATH, `${anomalyId}.json`);
if(!fs.existsSync(filename)) {
return null;
}
return getJsonDataSync(filename);
}
function loadAnomalyByName(anomalyName: string) : Anomaly {
let anomalyId = getAnomalyIdByName(anomalyName);
return loadAnomalyById(anomalyId);
}
function saveAnomalyTypeInfo(info) {
console.log('Saving');
let filename = path.join(ANOMALIES_PATH, `${info.name}.json`);
if(info.next_id === undefined) {
info.next_id = 0;
}
if(info.last_prediction_time === undefined) {
info.last_prediction_time = 0;
}
return writeJsonDataSync(filename, info);
}
function getAnomalyTypeInfo(name) {
return getJsonDataSync(path.join(ANOMALIES_PATH, `${name}.json`));
}
function setAnomalyStatus(anomalyId:AnomalyId, status:string) {
let info = loadAnomalyById(anomalyId);
info.status = status;
saveAnomaly(anomalyId, info);
}
function setAnomalyPredictionTime(anomalyId:AnomalyId, lastPredictionTime:number) {
let info = loadAnomalyById(anomalyId);
info.last_prediction_time = lastPredictionTime;
saveAnomaly(anomalyId, info);
}
export {
saveAnomaly, loadAnomalyById, loadAnomalyByName, insertAnomaly, removeAnomaly, saveAnomalyTypeInfo,
getAnomalyTypeInfo, getAnomalyIdByName, setAnomalyStatus, setAnomalyPredictionTime
}

server/src/services/json.ts (+55)

@@ -0,0 +1,55 @@
import * as fs from 'fs';
async function getJsonData(filename: string): Promise<Object> {
var data = await new Promise<string>((resolve, reject) => {
fs.readFile(filename, 'utf8', (err, data) => {
if(err) {
console.error(err);
reject('Can`t read file');
} else {
resolve(data);
}
});
});
try {
return JSON.parse(data);
} catch(e) {
console.error(e);
throw new Error('Wrong file format');
}
}
function writeJsonData(filename: string, data: Object) {
return new Promise((resolve, reject) => {
fs.writeFile(filename, JSON.stringify(data), 'utf8', (err) => {
if(err) {
console.error(err);
reject('Can`t write file');
} else {
resolve();
}
});
})
}
function getJsonDataSync(filename: string) {
let data = fs.readFileSync(filename, 'utf8');
try {
return JSON.parse(data);
} catch(e) {
console.error(e);
throw new Error('Wrong file format');
}
}
function writeJsonDataSync(filename: string, data: Object) {
fs.writeFileSync(filename, JSON.stringify(data));
}
export {
getJsonData,
writeJsonData,
getJsonDataSync,
writeJsonDataSync
}

server/src/services/metrics.ts (+27)

@@ -0,0 +1,27 @@
import * as path from 'path';
import { getJsonDataSync, writeJsonDataSync } from './json';
import { METRICS_PATH } from '../config';
import * as crypto from 'crypto';
function saveTargets(targets) {
let metrics = [];
for (let target of targets) {
metrics.push(saveTarget(target));
}
return metrics;
}
function saveTarget(target) {
//const md5 = crypto.createHash('md5')
const targetId = crypto.createHash('md5').update(JSON.stringify(target)).digest('hex');
let filename = path.join(METRICS_PATH, `${targetId}.json`);
writeJsonDataSync(filename, target);
return targetId;
}
function getTarget(targetId) {
let filename = path.join(METRICS_PATH, `${targetId}.json`);
return getJsonDataSync(filename);
}
export { saveTargets, getTarget }

server/src/services/notification.ts (+140)

@@ -0,0 +1,140 @@
//import * as Telegraf from 'telegraf'
import * as path from 'path';
import { DATA_PATH } from '../config';
import { getJsonDataSync, writeJsonDataSync } from './json';
import { AnomalyId } from './anomalyType';
type SubscriberId = string;
type SubscribersMap = Map< AnomalyId, SubscriberId[] >;
type BotConfig = {
token: string,
subscriptions: SubscribersMap
};
function sendNotification(anomalyName, active) {
console.log('Notification ' + anomalyName);
if(anomalyName in botConfig.subscriptions) {
let notificationMessage;
if(active) {
notificationMessage = 'Alert! Anomaly type ' + anomalyName;
} else {
notificationMessage = 'Ok! Anomaly type ' + anomalyName;
}
for (let SubscriberId of botConfig.subscriptions[anomalyName]) {
bot.telegram.sendMessage(SubscriberId, notificationMessage);
}
}
}
function loadBotConfig() : BotConfig {
let filename = path.join(DATA_PATH, `bot_config.json`);
let jsonData;
try {
jsonData = getJsonDataSync(filename);
} catch(e) {
console.error(e.message);
jsonData = [];
}
return jsonData;
}
function saveBotConfig(botConfig: BotConfig) {
let filename = path.join(DATA_PATH, `bot_config.json`);
try {
writeJsonDataSync(filename, botConfig);
} catch(e) {
console.error(e.message);
}
}
const commandArgs = (ctx, next) => {
try {
if(ctx.updateType === 'message') {
const text = ctx.update.message.text;
if(text !== undefined && text.startsWith('/')) {
const match = text.match(/^\/([^\s]+)\s?(.+)?/);
let args = [];
let command;
if(match !== null) {
if(match[1]) {
command = match[1];
}
if(match[2]) {
args = match[2].split(' ');
}
}
ctx.state.command = {
raw: text,
command,
args,
};
}
}
return next(ctx);
} catch (e) {
}
};
function addNotification(ctx) {
console.log('addNotification')
let command = ctx.state.command;
let chatId = ctx.chat.id;
if(command.args.length > 0) {
for (let anomalyName of command.args) {
if(!(anomalyName in botConfig.subscriptions)) {
botConfig.subscriptions[anomalyName] = []
}
if(botConfig.subscriptions[anomalyName].includes(chatId)) {
return ctx.reply('You are already subscribed on alerts from anomaly ' + command.args)
} else {
botConfig.subscriptions[anomalyName].push(chatId);
saveBotConfig(botConfig);
}
}
return ctx.reply('You have been successfully subscribed on alerts from anomaly ' + command.args)
} else {
return ctx.reply('You should use syntax: \/addNotification <anomaly_name>')
}
}
function removeNotification(ctx) {
let command = ctx.state.command;
let chatId = ctx.chat.id;
if(command.args.length > 0) {
for (let anomalyName of command.args) {
if(anomalyName in botConfig.subscriptions) {
botConfig.subscriptions[anomalyName] = botConfig.subscriptions[anomalyName].filter(el => el !== chatId);
saveBotConfig(botConfig);
}
}
return ctx.reply('You have been successfully unsubscribed from alerts from ' + command.args);
} else {
return ctx.reply('You should use syntax: \/removeNotification <anomaly_name>');
}
}
const Telegraf = require('telegraf');
let botConfig: BotConfig;
let bot;
function tgBotInit() {
try {
botConfig = loadBotConfig();
bot = new Telegraf(botConfig.token);
bot.use(commandArgs);
bot.command('addNotification', addNotification);
bot.command('removeNotification', removeNotification);
bot.startPolling();
} catch(e) {
// TODO: handle exception
}
}
export { sendNotification, tgBotInit }

server/src/services/segments.ts (+75)

@@ -0,0 +1,75 @@
import * as path from 'path';
import { getJsonDataSync, writeJsonDataSync } from './json';
import { SEGMENTS_PATH } from '../config';
import { AnomalyId, loadAnomalyById, saveAnomaly } from './anomalyType';
function getLabeledSegments(anomalyId: AnomalyId) {
let filename = path.join(SEGMENTS_PATH, `${anomalyId}_labeled.json`);
let segments = [];
try {
segments = getJsonDataSync(filename);
for (let segment of segments) {
if (segment.labeled === undefined) {
segment.labeled = false;
}
}
} catch (e) {
console.error(e.message);
}
return segments;
}
function getPredictedSegments(anomalyId: AnomalyId) {
let filename = path.join(SEGMENTS_PATH, `${anomalyId}_segments.json`);
let jsonData;
try {
jsonData = getJsonDataSync(filename);
} catch(e) {
console.error(e.message);
jsonData = [];
}
return jsonData;
}
function saveSegments(anomalyId: AnomalyId, segments) {
let filename = path.join(SEGMENTS_PATH, `${anomalyId}_labeled.json`);
try {
return writeJsonDataSync(filename, segments);
} catch(e) {
console.error(e.message);
throw new Error('Can`t write to db');
}
}
function insertSegments(anomalyId: AnomalyId, addedSegments, labeled:boolean) {
// Set status
let info = loadAnomalyById(anomalyId);
let segments = getLabeledSegments(anomalyId);
let nextId = info.next_id;
let addedIds = []
for (let segment of addedSegments) {
segment.id = nextId;
segment.labeled = labeled;
addedIds.push(nextId);
nextId++;
segments.push(segment);
}
info.next_id = nextId;
saveSegments(anomalyId, segments);
saveAnomaly(anomalyId, info);
return addedIds;
}
function removeSegments(anomalyId: AnomalyId, removedSegments) {
let segments = getLabeledSegments(anomalyId);
for (let segmentId of removedSegments) {
segments = segments.filter(el => el.id !== segmentId);
}
saveSegments(anomalyId, segments);
}
export { getLabeledSegments, getPredictedSegments, saveSegments, insertSegments, removeSegments }

server/tsconfig.json (+10)

@@ -0,0 +1,10 @@
{
"compilerOptions": {
"outDir": "./dist/",
"sourceMap": true,
"noImplicitAny": false,
"module": "commonjs",
"target": "es2015",
"allowJs": true
}
}

src/.gitignore (vendored, +11)

@@ -0,0 +1,11 @@
anomalies/
segments/
datasets/
datasources/
models/
metrics/
__pycache__/
*.pyc
*.txt
*.log
tasks.csv

src/add_anomaly.py (+5)

@@ -0,0 +1,5 @@
from worker import worker
if __name__ == "__main__":
w = worker()
w.do_task({"type": "learn", "anomaly_name": "cpu_utilization_supervised", "segments": []})

src/anomaly_model.py (+157)

@@ -0,0 +1,157 @@
import os.path
from data_provider import DataProvider
from data_preprocessor import data_preprocessor
import json
import pandas as pd
import logging
datasource_folder = "datasources/"
dataset_folder = "datasets/"
anomalies_folder = "anomalies/"
models_folder = "models/"
metrics_folder = "metrics/"
logger = logging.getLogger('analytic_toolset')
def anomalies_to_timestamp(anomalies):
for anomaly in anomalies:
anomaly['start'] = int(anomaly['start'].timestamp() * 1000)
anomaly['finish'] = int(anomaly['finish'].timestamp() * 1000)
return anomalies
class AnomalyModel:
def __init__(self, anomaly_name):
self.anomaly_name = anomaly_name
self.load_anomaly_config()
datasource = self.anomaly_config['metric']['datasource']
metric_name = self.anomaly_config['metric']['targets'][0]
dbconfig_filename = os.path.join(datasource_folder, datasource + ".json")
target_filename = os.path.join(metrics_folder, metric_name + ".json")
dataset_filename = os.path.join(dataset_folder, metric_name + ".csv")
augmented_path = os.path.join(dataset_folder, metric_name + "_augmented.csv")
with open(dbconfig_filename, 'r') as config_file:
dbconfig = json.load(config_file)
with open(target_filename, 'r') as file:
target = json.load(file)
self.data_prov = DataProvider(dbconfig, target, dataset_filename)
self.preprocessor = data_preprocessor(self.data_prov, augmented_path)
self.model = None
self.__load_model()
def anomalies_box(self, anomalies):
max_time = 0
min_time = float("inf")
for anomaly in anomalies:
max_time = max(max_time, anomaly['finish'])
min_time = min(min_time, anomaly['start'])
min_time = pd.to_datetime(min_time, unit='ms')
max_time = pd.to_datetime(max_time, unit='ms')
return min_time, max_time
def learn(self, anomalies):
logger.info("Start to learn for anomaly_name='%s'" % self.anomaly_name)
confidence = 0.02
dataframe = self.data_prov.get_dataframe()
start_index, stop_index = 0, len(dataframe)
if len(anomalies) > 0:
confidence = 0.0
min_time, max_time = self.anomalies_box(anomalies)
start_index = dataframe[dataframe['timestamp'] >= min_time].index[0]
stop_index = dataframe[dataframe['timestamp'] > max_time].index[0]
start_index, stop_index = self.preprocessor.expand_indexes(start_index, stop_index)
dataframe = dataframe[start_index:stop_index]
train_augmented = self.preprocessor.get_augmented_data(
start_index,
stop_index,
anomalies
)
self.model = self.create_algorithm()
self.model.fit(train_augmented, confidence)
if len(anomalies) > 0:
last_dataframe_time = dataframe.iloc[- 1]['timestamp']
last_prediction_time = int(last_dataframe_time.timestamp() * 1000)
else:
last_prediction_time = 0
self.__save_model()
logger.info("Learning is finished for anomaly_name='%s'" % self.anomaly_name)
return last_prediction_time
def predict(self, last_prediction_time):
logger.info("Start to predict for anomaly type='%s'" % self.anomaly_name)
last_prediction_time = pd.to_datetime(last_prediction_time, unit='ms')
start_index = self.data_prov.get_upper_bound(last_prediction_time)
stop_index = self.data_prov.size()
# last_prediction_time = pd.to_datetime(last_prediction_time, unit='ms')
# dataframe = dataframe[dataframe['timestamp'] > last_prediction_time]
last_prediction_time = int(last_prediction_time.timestamp() * 1000)
predicted_anomalies = []
if start_index < stop_index:
max_chunk_size = 50000
predicted = pd.Series()
for index in range(start_index, stop_index, max_chunk_size):
chunk_start = index
chunk_finish = min(index + max_chunk_size, stop_index)
predict_augmented = self.preprocessor.get_augmented_data(chunk_start, chunk_finish)
assert(len(predict_augmented) == chunk_finish - chunk_start)
predicted_current = self.model.predict(predict_augmented)
predicted = pd.concat([predicted, predicted_current])
predicted_anomalies = self.preprocessor.inverse_transform_anomalies(predicted)
last_row = self.data_prov.get_data_range(stop_index - 1, stop_index)
last_dataframe_time = last_row.iloc[0]['timestamp']
predicted_anomalies = anomalies_to_timestamp(predicted_anomalies)
last_prediction_time = int(last_dataframe_time.timestamp() * 1000)
logger.info("Predicting is finished for anomaly type='%s'" % self.anomaly_name)
return predicted_anomalies, last_prediction_time
def synchronize_data(self):
self.data_prov.synchronize()
self.preprocessor.set_data_provider(self.data_prov)
self.preprocessor.synchronize()
def load_anomaly_config(self):
with open(os.path.join(anomalies_folder, self.anomaly_name + ".json"), 'r') as config_file:
self.anomaly_config = json.load(config_file)
def get_anomalies(self):
labeled_anomalies_file = os.path.join(anomalies_folder, self.anomaly_name + "_labeled.json")
if not os.path.exists(labeled_anomalies_file):
return []
with open(labeled_anomalies_file) as file:
return json.load(file)
def create_algorithm(self):
from supervised_algorithm import supervised_algorithm
return supervised_algorithm()
def __save_model(self):
logger.info("Save model '%s'" % self.anomaly_name)
model_filename = os.path.join(models_folder, self.anomaly_name + ".m")
self.model.save(model_filename)
def __load_model(self):
logger.info("Load model '%s'" % self.anomaly_name)
model_filename = os.path.join(models_folder, self.anomaly_name + ".m")
if os.path.exists(model_filename):
self.model = self.create_algorithm()
self.model.load(model_filename)

src/data_preprocessor.py (+255)

@@ -0,0 +1,255 @@
import os.path
import pandas as pd
import numpy as np
import math
import time
from tsfresh.transformers.feature_augmenter import FeatureAugmenter
from tsfresh.feature_extraction.settings import from_columns
from pytz import timezone
class data_preprocessor:
# augmented = None
frame_size = 16
calc_features = [
# "value__agg_linear_trend__f_agg_\"max\"__chunk_len_5__attr_\"intercept\"",
# "value__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_12__w_20",
# "value__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_13__w_5",
# "value__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_2__w_10",
# "value__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_2__w_20",
# "value__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_8__w_20",
# "value__fft_coefficient__coeff_3__attr_\"abs\"",
"time_of_day_column_x",
"time_of_day_column_y",
"value__abs_energy",
"value__absolute_sum_of_changes",
"value__sum_of_reoccurring_data_points",
]
time_features = [
'time_of_day_column_x',
'time_of_day_column_y'
]
chunk_size = 50000
def __init__(self, data_provider, augmented_path):
self.data_provider = data_provider
self.augmented_path = augmented_path
self.last_chunk_index = 0
self.total_size = 0
self.__init_chunks()
self.synchronize()
def set_data_provider(self, data_provider):
self.data_provider = data_provider
def synchronize(self):
start_frame = self.total_size
stop_frame = self.data_provider.size()
max_chunk_size = 30000
for frame in range(start_frame, stop_frame, max_chunk_size):
data = self.__get_source_frames(frame, min(stop_frame, frame + max_chunk_size))
if len(data) == 0:
return
append_augmented = self.__extract_features(data, self.calc_features)
self.__append_data(append_augmented)
def expand_indexes(self, start_index, stop_index):
return start_index, stop_index
def get_augmented_data(self, start_index, stop_index, anomalies=[]):
start_frame = start_index
stop_frame = stop_index
augmented = self.__get_data(start_frame, stop_frame)
if len(anomalies) > 0:
anomalies_indexes = self.transform_anomalies(anomalies)
augmented = augmented.drop(anomalies_indexes)
return augmented
def transform_anomalies(self, anomalies):
anomaly_index = None
dataframe = self.data_provider.get_dataframe(None)
for anomaly in anomalies:
start_time = pd.to_datetime(anomaly['start'], unit='ms')
finish_time = pd.to_datetime(anomaly['finish'], unit='ms')
current_index = (dataframe['timestamp'] >= start_time) & (dataframe['timestamp'] <= finish_time)
if anomaly_index is not None:
anomaly_index = (anomaly_index | current_index)
else:
anomaly_index = current_index
rows = dataframe[anomaly_index]
# indexes = np.floor_divide(rows.index, self.frame_size)
indexes = np.unique(rows.index)
return indexes
def inverse_transform_anomalies(self, prediction):
anomalies = []
cur_anomaly = None
source_dataframe = self.data_provider.get_dataframe(None)
for i in prediction.index:
if prediction[i]:
start_frame_index = max(0, i - self.frame_size + 1)
finish_frame_index = i
start = source_dataframe['timestamp'][start_frame_index]
finish = source_dataframe['timestamp'][finish_frame_index]
if cur_anomaly is None:
if len(anomalies) > 0 and start <= anomalies[len(anomalies) - 1]['finish']:
cur_anomaly = anomalies[len(anomalies) - 1]
anomalies.pop()
else:
cur_anomaly = {'start': start, 'finish': finish}
cur_anomaly['finish'] = finish
elif cur_anomaly is not None:
anomalies.append(cur_anomaly)
cur_anomaly = None
if cur_anomaly:
anomalies.append(cur_anomaly)
return anomalies
def __get_data(self, start_index, stop_index):
result = pd.DataFrame()
start_chunk = start_index // self.chunk_size
finish_chunk = stop_index // self.chunk_size
for chunk_num in range(start_chunk, finish_chunk + 1):
chunk = self.__load_chunk(chunk_num)
if chunk_num == finish_chunk:
chunk = chunk[:stop_index % self.chunk_size]
if chunk_num == start_chunk:
chunk = chunk[start_index % self.chunk_size:]
result = pd.concat([result, chunk])
return result
def __init_chunks(self):
chunk_index = 0
self.last_chunk_index = 0
while True:
filename = self.augmented_path
if chunk_index > 0:
filename += "." + str(chunk_index)
if os.path.exists(filename):
self.last_chunk_index = chunk_index
else:
break
chunk_index += 1
self.total_size = self.last_chunk_index * self.chunk_size
last_chunk = self.__load_chunk(self.last_chunk_index)
self.total_size += len(last_chunk)
def __append_data(self, dataframe):
while len(dataframe) > 0:
chunk = self.__load_chunk(self.last_chunk_index)
rows_count = min(self.chunk_size - len(chunk), len(dataframe))
rows = dataframe.iloc[0:rows_count]
self.__save_chunk(self.last_chunk_index, rows)
self.total_size += rows_count
dataframe = dataframe[rows_count:]
if len(dataframe) > 0:
self.last_chunk_index += 1
def __load_chunk(self, index):
filename = self.augmented_path
if index > 0:
filename += "." + str(index)
if os.path.exists(filename):
chunk = pd.read_csv(filename)
frame_index = np.arange(index * self.chunk_size, index * self.chunk_size + len(chunk))
chunk = chunk.set_index(frame_index)
return chunk
return pd.DataFrame()
def __save_chunk(self, index, dataframe):
filename = self.augmented_path
if index > 0:
filename += "." + str(index)
if os.path.exists(filename):
dataframe.to_csv(filename, mode='a', index=False, header=False)
else:
dataframe.to_csv(filename, mode='w', index=False, header=True)
def __get_source_frames(self, start_frame, stop_frame):
start_index = start_frame
stop_index = stop_frame
# frame = self.source_dataframe[start_index:stop_index]
# mat = frame.as_matrix()
source_dataframe = self.data_provider.get_data_range(max(start_index - self.frame_size + 1, 0), stop_index)
dataframe = None
for i in range(start_index, stop_index):
mini = max(0, i - self.frame_size + 1)
frame = source_dataframe.loc[mini:i + 1].copy()
frame['id'] = i
if dataframe is None:
dataframe = frame
else:
dataframe = dataframe.append(frame, ignore_index=True)
#dataframe = self.source_dataframe[start_index:stop_index].copy()
#dataframe['id'] = np.floor_divide(dataframe.index, self.frame_size)
dataframe.reset_index(drop=True, inplace=True)
return dataframe
def __extract_features(self, data, features=None):
start_frame = data['id'][0]
stop_frame = data['id'][len(data)-1] + 1
augmented = pd.DataFrame(index=np.arange(start_frame, stop_frame))
# tsfresh features
tsfresh_features = None
if features is not None:
tsfresh_features = set(features) - set(self.time_features)
augmented = self.__extract_tfresh_features(data, augmented, tsfresh_features)
# time features
augmented = self.__extract_time_features(data, augmented, features)
return augmented
def __extract_tfresh_features(self, data, augmented, features):
relevant_extraction_settings = None
if features is not None:
augmented_features = set(features)
relevant_extraction_settings = from_columns(augmented_features)
#impute_function = partial(impute_dataframe_range, col_to_max=self.col_to_max,
# col_to_min=self.col_to_min, col_to_median=self.col_to_median)
feature_extractor = FeatureAugmenter(
kind_to_fc_parameters=relevant_extraction_settings,
column_id='id',
column_sort='timestamp')
feature_extractor.set_timeseries_container(data)
return feature_extractor.transform(augmented)
def __extract_time_features(self, data, augmented, features):
if features is None:
features = self.time_features
seconds = np.zeros(len(augmented))
first_id = data['id'][0]
for i in range(len(data)):
id = data['id'][i] - first_id
timeobj = data['timestamp'][i].time()
seconds[id] = timeobj.second + 60 * (timeobj.minute + 60 * timeobj.hour)
norm_seconds = 2 * math.pi * seconds / (24 * 3600)
if 'time_of_day_column_x' in features:
augmented['time_of_day_column_x'] = np.cos(norm_seconds)
if 'time_of_day_column_y' in features:
augmented['time_of_day_column_y'] = np.sin(norm_seconds)
return augmented

src/data_provider.py (+220)

@@ -0,0 +1,220 @@
from influxdb import InfluxDBClient
import pandas as pd
import os.path
import numpy as np
class DataProvider:
chunk_size = 50000
def __init__(self, dbconfig, target, data_filename):
self.dbconfig = dbconfig
self.target = target
self.data_filename = data_filename
self.last_time = None
self.total_size = 0
self.last_chunk_index = 0
self.chunk_last_times = {}
self.__init_chunks()
self.synchronize()
def get_dataframe(self, after_time=None):
result = pd.DataFrame()
for chunk_index, last_chunk_time in self.chunk_last_times.items():
if after_time is None or after_time <= last_chunk_time:
chunk = self.__load_chunk(chunk_index)
if after_time is not None:
chunk = chunk[chunk['timestamp'] > after_time]
result = pd.concat([result, chunk])
return result
def get_upper_bound(self, after_time):
for chunk_index, last_chunk_time in self.chunk_last_times.items():
if after_time < last_chunk_time:
chunk = self.__load_chunk(chunk_index)
chunk = chunk[chunk['timestamp'] > after_time]
return chunk.index[0]
return self.size()
def size(self):
return self.total_size
def get_data_range(self, start_index, stop_index=None):
return self.__get_data(start_index, stop_index)
def transform_anomalies(self, anomalies):
result = []
if len(anomalies) == 0:
return result
dataframe = self.get_dataframe(None)
for anomaly in anomalies:
start_time = pd.to_datetime(anomaly['start']-1, unit='ms')
finish_time = pd.to_datetime(anomaly['finish']+1, unit='ms')
current_index = (dataframe['timestamp'] >= start_time) & (dataframe['timestamp'] <= finish_time)
anomaly_frame = dataframe[current_index]
cur_anomaly = {
'start': anomaly_frame.index[0],
'finish': anomaly_frame.index[len(anomaly_frame) - 1],
'labeled': anomaly['labeled']
}
result.append(cur_anomaly)
return result
def inverse_transform_indexes(self, indexes):
if len(indexes) == 0:
return []
dataframe = self.get_data_range(indexes[0][0], indexes[-1][1] + 1)
return [(dataframe['timestamp'][i1], dataframe['timestamp'][i2]) for (i1, i2) in indexes]
def synchronize(self):
# last_time = None
# if len(self.dataframe/) > 0:
# last_time = self.dataframe['time'][len(self.dataframe)-1]
append_dataframe = self.load_from_db(self.last_time)
self.__append_data(append_dataframe)
# append_dataframe
# append_dataframe.to_csv(self.data_filename, mode='a', index=False, header=False)
# self.dataframe = pd.concat([self.dataframe, append_dataframe], ignore_index=True)
# def load(self):
# if os.path.exists(self.data_filename):
# self.dataframe = pd.read_csv(self.data_filename, parse_dates=[0])
# self.synchronize()
# else:
# append_dataframe = self.load_from_db()
# self.__append_data(append_dataframe)
# #self.dataframe.to_csv(self.data_filename, index=False, header=True)
def custom_query(self, after_time):
query = self.target["query"]
timeFilter = "TRUE"
if after_time is not None:
timeFilter = "time > '%s'" % (str(after_time))
query = query.replace("$timeFilter", timeFilter)
return query
def load_from_db(self, after_time=None):
"""Instantiate a connection to the InfluxDB."""
host = self.dbconfig['host']
port = self.dbconfig['port']
user = self.dbconfig['user']
password = self.dbconfig['password']
dbname = self.dbconfig['dbname']
client = InfluxDBClient(host, port, user, password, dbname)
# query = 'select k0, k1, k2 from vals;'
measurement = self.target['measurement']
select = self.target['select']
tags = self.target['tags']
if "query" in self.target:
query = self.custom_query(after_time)
else:
select_values = select[0][0]['params']
escaped_select_values = ["\"" + value + "\"" for value in select_values]
conditions_entries = []
if len(tags) > 0:
for tag in tags:
conditions_entries.append("(\"" + tag['key'] + "\"" + tag['operator'] + "'" + tag['value'] + "')")
if after_time:
conditions_entries.append("time > '%s'" % (str(after_time)))
condition = ""
if len(conditions_entries) > 0:
condition = " where " + " AND ".join(conditions_entries)
query = "select %s from \"%s\"%s;" % (",".join(escaped_select_values), measurement, condition)
result = client.query(query, chunked=True, chunk_size=10000)
dataframe = pd.DataFrame(result.get_points())
if len(dataframe) > 0:
cols = dataframe.columns.tolist()
cols.remove('time')
cols = ['time'] + cols
dataframe = dataframe[cols]
dataframe['time'] = pd.to_datetime(dataframe['time'])
dataframe = dataframe.dropna(axis=0, how='any')
return dataframe
def __init_chunks(self):
chunk_index = 0
self.last_chunk_index = 0
while True:
filename = self.data_filename
if chunk_index > 0:
filename += "." + str(chunk_index)
if os.path.exists(filename):
self.last_chunk_index = chunk_index
chunk = self.__load_chunk(chunk_index)
chunk_last_time = chunk.iloc[len(chunk) - 1]['timestamp']
self.chunk_last_times[chunk_index] = chunk_last_time
self.last_time = chunk_last_time
else:
break
chunk_index += 1
self.total_size = self.last_chunk_index * self.chunk_size
last_chunk = self.__load_chunk(self.last_chunk_index)
self.total_size += len(last_chunk)
def __load_chunk(self, index):
filename = self.data_filename
if index > 0:
filename += "." + str(index)
if os.path.exists(filename):
chunk = pd.read_csv(filename, parse_dates=[0])
frame_index = np.arange(index * self.chunk_size, index * self.chunk_size + len(chunk))
chunk = chunk.set_index(frame_index)
return chunk.rename(columns={chunk.columns[0]: "timestamp", chunk.columns[1]: "value"})
return pd.DataFrame()
def __save_chunk(self, index, dataframe):
filename = self.data_filename
if index > 0:
filename += "." + str(index)
chunk_last_time = dataframe.iloc[len(dataframe) - 1]['time']
self.chunk_last_times[index] = chunk_last_time
if os.path.exists(filename):
dataframe.to_csv(filename, mode='a', index=False, header=False)
else:
dataframe.to_csv(filename, mode='w', index=False, header=True)
def __append_data(self, dataframe):
while len(dataframe) > 0:
chunk = self.__load_chunk(self.last_chunk_index)
rows_count = min(self.chunk_size - len(chunk), len(dataframe))
rows = dataframe.iloc[0:rows_count]
if len(rows) > 0:
self.__save_chunk(self.last_chunk_index, rows)
self.total_size += rows_count
self.last_time = rows.iloc[-1]['time']
dataframe = dataframe[rows_count:]
if len(dataframe) > 0:
self.last_chunk_index += 1
def __get_data(self, start_index, stop_index):
result = pd.DataFrame()
start_chunk = start_index // self.chunk_size
finish_chunk = self.last_chunk_index
if stop_index is not None:
finish_chunk = stop_index // self.chunk_size
for chunk_num in range(start_chunk, finish_chunk + 1):
chunk = self.__load_chunk(chunk_num)
if stop_index is not None and chunk_num == finish_chunk:
chunk = chunk[:stop_index % self.chunk_size]
if chunk_num == start_chunk:
chunk = chunk[start_index % self.chunk_size:]
result = pd.concat([result, chunk])
return result

src/learn.py (+52)

@@ -0,0 +1,52 @@
#!/usr/bin/env python
import csv
import os
from worker import worker
def enqueue_task():
tasks_file = "tasks.csv"
tasks = []
with open(tasks_file) as csvfile:
rdr = csv.reader(csvfile, delimiter=',')
tasks = list(rdr)
if len(tasks) == 0:
return None
res = tasks[0][0]
tasks = tasks[1:]
with open(tasks_file, "w+") as csvfile:
writer = csv.writer(csvfile)
writer.writerows(tasks)
return res
def set_lock(value):
lock_file = "learn.lock"
exists = os.path.exists(lock_file)
if exists == value:
return False
if value:
open(lock_file, "w+")
else:
os.remove(lock_file)
return True
if __name__ == "__main__":
if not set_lock(True):
print("learn locked")
exit(0)
w = worker()
while True:
task = enqueue_task()
if task is None:
break
w.start()
w.add_task({"type": "learn", "anomaly_name": task})
w.add_task({"type": "predict", "anomaly_name": task})
w.stop()
set_lock(False)

src/pattern_detection_model.py (+127)

@@ -0,0 +1,127 @@
from data_provider import DataProvider
import logging
import os.path
import json
import pandas as pd
datasource_folder = "datasources/"
dataset_folder = "datasets/"
anomalies_folder = "anomalies/"
models_folder = "models/"
metrics_folder = "metrics/"
logger = logging.getLogger('analytic_toolset')
def segments_box(segments):
max_time = 0
min_time = float("inf")
for segment in segments:
min_time = min(min_time, segment['start'])
max_time = max(max_time, segment['finish'])
min_time = pd.to_datetime(min_time, unit='ms')
max_time = pd.to_datetime(max_time, unit='ms')
return min_time, max_time
class PatternDetectionModel:
def __init__(self, pattern_name, preset=None):
self.pattern_name = pattern_name
self.preset = preset
self.__load_anomaly_config()
datasource = self.anomaly_config['metric']['datasource']
metric_name = self.anomaly_config['metric']['targets'][0]
dbconfig_filename = os.path.join(datasource_folder, datasource + ".json")
target_filename = os.path.join(metrics_folder, metric_name + ".json")
dataset_filename = os.path.join(dataset_folder, metric_name + ".csv")
with open(dbconfig_filename, 'r') as config_file:
dbconfig = json.load(config_file)
with open(target_filename, 'r') as file:
target = json.load(file)
self.data_prov = DataProvider(dbconfig, target, dataset_filename)
self.model = None
self.__load_model(preset)
def learn(self, segments):
self.model = self.__create_model(self.preset)
window_size = 200
dataframe = self.data_prov.get_dataframe()
start_index, stop_index = 0, len(dataframe)
if len(segments) > 0:
min_time, max_time = segments_box(segments)
start_index = dataframe[dataframe['timestamp'] >= min_time].index[0]
stop_index = dataframe[dataframe['timestamp'] > max_time].index[0]
start_index = max(start_index - window_size, 0)
stop_index = min(stop_index + window_size, len(dataframe))
dataframe = dataframe[start_index:stop_index]
segments = self.data_prov.transform_anomalies(segments)
self.model.fit(dataframe, segments)
self.__save_model()
return 0
# return last_prediction_time
def predict(self, last_prediction_time):
if self.model is None:
return [], last_prediction_time
window_size = 100
last_prediction_time = pd.to_datetime(last_prediction_time, unit='ms')
start_index = self.data_prov.get_upper_bound(last_prediction_time)
start_index = max(0, start_index - window_size)
dataframe = self.data_prov.get_data_range(start_index)
predicted_indexes = self.model.predict(dataframe)
predicted_indexes = [(x, y) for (x, y) in predicted_indexes if x >= start_index and y >= start_index]
predicted_times = self.data_prov.inverse_transform_indexes(predicted_indexes)
segments = []
for time_value in predicted_times:
ts1 = int(time_value[0].timestamp() * 1000)
ts2 = int(time_value[1].timestamp() * 1000)
segments.append({
'start': ts1,
'finish': ts2
})
last_dataframe_time = dataframe.iloc[- 1]['timestamp']
last_prediction_time = int(last_dataframe_time.timestamp() * 1000)
return segments, last_prediction_time
# return predicted_anomalies, last_prediction_time
def synchronize_data(self):
self.data_prov.synchronize()
def __create_model(self, preset):
if preset == "peaks":
from peaks_detector import PeaksDetector
return PeaksDetector()
if preset == "steps" or preset == "cliffs":
from step_detector import StepDetector
return StepDetector(preset)
def __load_anomaly_config(self):
with open(os.path.join(anomalies_folder, self.pattern_name + ".json"), 'r') as config_file:
self.anomaly_config = json.load(config_file)
def __save_model(self):
logger.info("Save model '%s'" % self.pattern_name)
model_filename = os.path.join(models_folder, self.pattern_name + ".m")
self.model.save(model_filename)
def __load_model(self, preset):
logger.info("Load model '%s'" % self.pattern_name)
model_filename = os.path.join(models_folder, self.pattern_name + ".m")
if os.path.exists(model_filename):
self.model = self.__create_model(preset)
self.model.load(model_filename)

src/peaks_detector.py (+71)

@@ -0,0 +1,71 @@
from scipy import signal
import numpy as np
import step_detect
class PeaksDetector:
def __init__(self):
pass
def fit(self, dataset, contamination=0.005):
pass
def predict(self, dataframe):
array = dataframe['value'].as_matrix()
window_size = 20
# window = np.ones(101)
# mean_filtered = signal.fftconvolve(
# np.concatenate([np.zeros(window_size), array, np.zeros(window_size)]),
# window,
# mode='valid'
# )
# filtered = np.divide(array, mean_filtered / 101)
window = signal.general_gaussian(2 * window_size + 1, p=0.5, sig=5)
#print(window)
filtered = signal.fftconvolve(array, window, mode='valid')
# filtered = np.concatenate([
# np.zeros(window_size),
# filtered,
# np.zeros(window_size)
# ])
filtered = filtered / np.sum(window)
array = array[window_size:-window_size]
filtered = np.subtract(array, filtered)
import matplotlib.pyplot as plt
# filtered = np.convolve(array, step, mode='valid')
# print(len(array))
# print(len(filtered))
# step = np.hstack((np.ones(window_size), 0, -1*np.ones(window_size)))
#
# conv = np.convolve(array, step, mode='valid')
#
# conv = np.concatenate([
# np.zeros(window_size),
# conv,
# np.zeros(window_size)])
#data = step_detect.t_scan(array, window=window_size)
data = filtered
data /= data.max()
#plt.plot(array[:1000])
plt.plot(data[:1000])
plt.show()
result = step_detect.find_steps(data, 0.1)
return [dataframe.index[x + window_size] for x in result]
def save(self, model_filename):
pass
# with open(model_filename, 'wb') as file:
# pickle.dump((self.clf, self.scaler), file)
def load(self, model_filename):
pass
# with open(model_filename, 'rb') as file:
# self.clf, self.scaler = pickle.load(file)

src/predict.py (+83)

@@ -0,0 +1,83 @@
import argparse
import csv
import time
import datetime
import pandas as pd
import matplotlib.pyplot as plt
from influxdb import InfluxDBClient
from sklearn import svm
import numpy as np
import math
import pickle
host = "209.205.120.226"
port = 8086
datasetFile = "/tmp/dataset.csv"
anomaliesFile = "anomalies.csv"
predictedAnomaliesFile = "predicted_anomalies.csv"
modelFilename = 'finalized_model.sav'
def readAnomalies():
anomalies = []
with open(anomaliesFile) as csvfile:
rdr = csv.reader(csvfile, delimiter=',')
for row in rdr:
anomaly = (int(row[0]), int(row[1]))
anomalies.append(anomaly)
return anomalies
"""Instantiate a connection to the InfluxDB."""
user = ''
password = ''
dbname = 'accelerometer'
query = 'select k0, k1, k2 from vals limit 10000;'
client = InfluxDBClient(host, port, user, password, dbname)
def predict(host=host, port=port):
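# Query raw accelerometer values, apply a 200-point triangular rolling sum,
# run the pickled classifier on (k0, k1, k2), merge positive predictions that
# are less than one second apart into (start, finish) intervals, skip anything
# that precedes the last labeled anomaly, and dump the result to CSV.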
result = client.query(query)
df = pd.DataFrame(result['vals'], columns=['time', 'k0', 'k1', 'k2'])
basedAnomalies = readAnomalies()
df2 = df.rolling(200, win_type='triang').sum()
df2['time'] = pd.to_datetime(df2['time'])
df2 = df2[np.isfinite(df2['k0'])]
print(len(df2))
anomalies = []
last_anomaly = (-1, -1)
with open(modelFilename, 'rb') as fid:
clf = pickle.load(fid)
prediction = clf.predict(df2[['k0', 'k1', 'k2']])
print(len(prediction))
#print(prediction)
for i in range(len(prediction)):
if prediction[i] > 0.:
t = df2['time'][i + 199].timestamp()
t = ((t + 0 * 3600) * 1000)
if t < basedAnomalies[len(basedAnomalies) - 1][1]:
continue
if t < last_anomaly[1] + 1000:
last_anomaly = (last_anomaly[0], t)
else:
if last_anomaly[1] != -1:
anomalies.append(last_anomaly)
last_anomaly = (t, t)
with open(predictedAnomaliesFile, "w") as file:
for anomaly in anomalies:
file.write(str(int(anomaly[0])) + "," + str(int(anomaly[1])) + "\n")
predict()

46
src/prophet_algorithm.py

@ -0,0 +1,46 @@
from fbprophet import Prophet
import pandas as pd
class prophet_algorithm(object):
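# Fits a Prophet model (daily seasonality only) to the series inside predict()
# and flags points whose distance outside the forecast interval
# [yhat_lower, yhat_upper] exceeds a fixed threshold (17), merging consecutive
# flagged points into segments weighted by the accumulated excess.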
def __init__(self):
self.model = None
self.dataset = None
def fit(self, data, anomalies):
pass
def predict(self, data):
data = data.reset_index()
data = data.rename(columns={'timestamp': 'ds', 'value': 'y'})
self.dataset = data
self.model = Prophet(yearly_seasonality=False, weekly_seasonality=False, daily_seasonality=True)
self.model.fit(self.dataset)
future = self.model.make_future_dataframe(freq='H', periods=0, include_history=True)
forecast = self.model.predict(future)
cmp_df = forecast.set_index('ds')[['yhat', 'yhat_lower', 'yhat_upper']].join(self.dataset.set_index('ds'))
cmp_df['e'] = [ max(row.y - row.yhat_upper, row.yhat_lower - row.y, 0) for index, row in cmp_df.iterrows() ]
return self.__calc_anomalies(cmp_df)
def __calc_anomalies(self, dataset):
anomalies = []
cur_anomaly = None
for i in range(len(dataset)):
if dataset['e'][i] > 17:
if cur_anomaly is None:
cur_anomaly = {'start': dataset.index[i], 'finish': dataset.index[i], 'weight': 0}
cur_anomaly['finish'] = dataset.index[i]
cur_anomaly['weight'] += dataset['e'][i]
elif cur_anomaly is not None:
anomalies.append(cur_anomaly)
cur_anomaly = None
return anomalies
if __name__ == "__main__":
dataset = pd.read_csv('art_daily_flatmiddle.csv', index_col=['timestamp'], parse_dates=['timestamp'])
# the constructor takes no arguments and fit() is a no-op; predict() does the work
algo = prophet_algorithm()
res = algo.predict(dataset)
print(res)

231
src/step_detect.py

@ -0,0 +1,231 @@
"""
Thomas Kahn
thomas.b.kahn@gmail.com
"""
from __future__ import absolute_import
from math import sqrt
import multiprocessing as mp
import numpy as np
from six.moves import range
from six.moves import zip
def t_scan(L, window = 1e3, num_workers = -1):
"""
Computes t statistic for i to i+window points versus i-window to i
points for each point i in input array. Uses multiple processes to
do this calculation asynchronously. Array is decomposed into window
number of frames, each consisting of points spaced at window
intervals. This optimizes the calculation, as the drone function
need only compute the mean and variance for each set once.
Parameters
----------
L : numpy array
1 dimensional array that represents time series of datapoints
window : int / float
Number of points that comprise the windows of data that are
compared
num_workers : int
Number of worker processes for parallel t_stat computation.
Default value (-1) uses num_cpu - 1 workers
Returns
-------
t_stat : numpy array
Array which holds t statistic values for each point. The first
and last (window) points are replaced with zero, since the t
statistic calculation cannot be performed in that case.
"""
size = L.size
window = int(window)
frames = list(range(window))
n_cols = (size // window) - 1
t_stat = np.zeros((window, n_cols))
if num_workers == 1:
results = [_t_scan_drone(L, n_cols, frame, window) for frame in frames]
else:
if num_workers == -1:
num_workers = mp.cpu_count() - 1
pool = mp.Pool(processes = num_workers)
results = [pool.apply_async(_t_scan_drone, args=(L, n_cols, frame, window)) for frame in frames]
results = [r.get() for r in results]
pool.close()
for index, row in results:
t_stat[index] = row
t_stat = np.concatenate((
np.zeros(window),
t_stat.transpose().ravel(order='C'),
np.zeros(size % window)
))
return t_stat
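# Editor's usage sketch (not part of the original module); the signal and the
# window value below are illustrative:
#
#   import numpy as np
#   sig = np.concatenate([np.zeros(1000), np.ones(1000)])
#   sig += np.random.normal(0, 0.05, sig.size)
#   t = t_scan(sig, window=200, num_workers=1)   # |t| peaks near the step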
def _t_scan_drone(L, n_cols, frame, window=1e3):
"""
Drone function for t_scan. Not intended to be called manually.
Computes t_scan for the designated frame and returns the result as an
array, along with an integer tag for proper placement in the
aggregate array.
"""
size = L.size
window = int(window)
root_n = sqrt(window)
output = np.zeros(n_cols)
b = L[frame:window+frame]
b_mean = b.mean()
b_var = b.var()
for i in range(window+frame, size-window, window):
a = L[i:i+window]
a_mean = a.mean()
a_var = a.var()
output[i // window - 1] = root_n * (a_mean - b_mean) / sqrt(a_var + b_var)
b_mean, b_var = a_mean, a_var
return frame, output
def mz_fwt(x, n=2):
"""
Computes the multiscale product of the Mallat-Zhong discrete forward
wavelet transform up to and including scale n for the input data x.
If n is even, the spikes in the signal will be positive. If n is odd
the spikes will match the polarity of the step (positive for steps
up, negative for steps down).
This function is essentially a direct translation of the MATLAB code
provided by Sadler and Swami in section A.4 of the following:
http://www.dtic.mil/dtic/tr/fulltext/u2/a351960.pdf
Parameters
----------
x : numpy array
1 dimensional array that represents time series of data points
n : int
Highest scale to multiply to
Returns
-------
prod : numpy array
The multiscale product for x
"""
N_pnts = x.size
lambda_j = [1.5, 1.12, 1.03, 1.01][0:n]
if n > 4:
lambda_j += [1.0]*(n-4)
H = np.array([0.125, 0.375, 0.375, 0.125])
G = np.array([2.0, -2.0])
Gn = [2]
Hn = [3]
for j in range(1,n):
q = 2**(j-1)
Gn.append(q+1)
Hn.append(3*q+1)
S = np.concatenate((x[::-1], x))
S = np.concatenate((S, x[::-1]))
prod = np.ones(N_pnts)
for j in range(n):
n_zeros = 2**j - 1
Gz = _insert_zeros(G, n_zeros)
Hz = _insert_zeros(H, n_zeros)
current = (1.0/lambda_j[j])*np.convolve(S,Gz)
current = current[N_pnts+Gn[j]:2*N_pnts+Gn[j]]
prod *= current
if j == n-1:
break
S_new = np.convolve(S, Hz)
S_new = S_new[N_pnts+Hn[j]:2*N_pnts+Hn[j]]
S = np.concatenate((S_new[::-1], S_new))
S = np.concatenate((S, S_new[::-1]))
return prod
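# Editor's usage sketch (not part of the original module), with sig as in the
# sketch after t_scan above: the multiscale product sharpens step edges, so
# thresholding its normalized absolute value with find_steps (defined below)
# yields candidate step indices. The threshold 0.1 is illustrative.
#
#   prod = mz_fwt(sig, n=2)
#   steps = find_steps(prod / np.abs(prod).max(), 0.1)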
def _insert_zeros(x, n):
"""
Helper function for mz_fwt. Splits input array and adds n zeros
between values.
"""
newlen = (n+1)*x.size
out = np.zeros(newlen)
indices = list(range(0, newlen-n, n+1))
out[indices] = x
return out
def find_steps(array, threshold):
"""
Finds local maxima by segmenting array based on positions at which
the threshold value is crossed. Note that this thresholding is
applied after the absolute value of the array is taken. Thus,
the distinction between upward and downward steps is lost. However,
get_step_sizes can be used to determine directionality after the
fact.
Parameters
----------
array : numpy array
1 dimensional array that represents time series of data points
threshold : int / float
Threshold value that defines a step
Returns
-------
steps : list
List of indices of the detected steps
"""
steps = []
array = np.abs(array)
above_points = np.where(array > threshold, 1, 0)
ap_dif = np.diff(above_points)
cross_ups = np.where(ap_dif == 1)[0]
cross_dns = np.where(ap_dif == -1)[0]
for upi, dni in zip(cross_ups,cross_dns):
steps.append(np.argmax(array[upi:dni]) + upi)
return steps
def get_step_sizes(array, indices, window=1000):
"""
Calculates step size for each index within the supplied list. Step
size is determined by averaging over a range of points (specified
by the window parameter) before and after the index of step
occurrence. The directionality of the step is reflected by the sign
of the step size (i.e. a positive value indicates an upward step,
and a negative value indicates a downward step). The combined
standard deviation of both measurements (as a measure of uncertainty
in step calculation) is also provided.
Parameters
----------
array : numpy array
1 dimensional array that represents time series of data points
indices : list
List of indices of the detected steps (as provided by
find_steps, for example)
window : int, optional
Number of points to average over to determine baseline levels
before and after step.
Returns
-------
step_sizes : list
List of the calculated sizes of each step
step_error : list
List of the combined standard deviations of the two averaging
windows for each step, as a measure of uncertainty in the step size
"""
step_sizes = []
step_error = []
indices = sorted(indices)
last = len(indices) - 1
for i, index in enumerate(indices):
if i == 0:
q = min(window, indices[i+1]-index)
elif i == last:
q = min(window, index - indices[i-1])
else:
q = min(window, index-indices[i-1], indices[i+1]-index)
a = array[index:index+q]
b = array[index-q:index]
step_sizes.append(a.mean() - b.mean())
step_error.append(sqrt(a.var()+b.var()))
return step_sizes, step_error
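# Editor's end-to-end sketch (not part of the original module); the signal,
# thresholds and window sizes are illustrative:
#
#   sig = np.concatenate([np.zeros(1000), np.ones(1000), 3 * np.ones(1000)])
#   sig += np.random.normal(0, 0.05, sig.size)
#   t = t_scan(sig, window=200, num_workers=1)
#   steps = find_steps(t / np.abs(t).max(), 0.3)           # two candidate steps expected
#   sizes, errors = get_step_sizes(sig, steps, window=200)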

188
src/step_detector.py

@ -0,0 +1,188 @@
import numpy as np
import pickle
def find_segments(array, threshold):
segments = []
above_points = np.where(array > threshold, 1, 0)
ap_dif = np.diff(above_points)
cross_ups = np.where(ap_dif == 1)[0]
cross_dns = np.where(ap_dif == -1)[0]
for upi, dni in zip(cross_ups,cross_dns):
segments.append((upi, dni))
return segments
def is_intersect(target_segment, segments):
for segment in segments:
start = max(segment['start'], target_segment[0])
finish = min(segment['finish'], target_segment[1])
if start <= finish:
return True
return False
def calc_intersections(segments, finded_segments):
intersections = 0
labeled = 0
for segment in segments:
if not segment['labeled']:
continue
labeled += 1
intersect = False
for finded_segment in finded_segments:
start = max(segment['start'], finded_segment[0])
finish = min(segment['finish'], finded_segment[1])
if start <= finish:
intersect = True
break
if intersect:
intersections += 1
return intersections, labeled
def cost_function(segments, finded_segments):
intersections, labeled = calc_intersections(segments, finded_segments)
return intersections == labeled
def compress_segments(segments):
result = []
for segment in segments:
if len(result) == 0 or result[len(result) - 1][1] < segment[0]:
result.append(segment)
else:
result[len(result) - 1] = (result[len(result) - 1][0], segment[1])
return result
class StepDetector:
def __init__(self, preset):
self.preset = preset
self.mean = None
self.window_size = None
self.corr_max = None
self.threshold = None
self.segments = []
def fit(self, dataframe, segments, contamination=0.01):
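# Center the series on its mean, then search window sizes (10, growing by 1.2x
# up to 100) and, for each, bisect the correlation threshold so that every
# labeled segment is matched; the window size with the lowest cost is kept and
# its threshold re-derived for predict().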
array = dataframe['value'].values  # .as_matrix() was removed in newer pandas
self.mean = array.mean()
self.segments = segments
norm_data = (array - self.mean)
self.__optimize(norm_data, segments, contamination)
# print(self.threshold)
# import matplotlib.pyplot as plt
# fig, ax = plt.subplots(figsize=[18, 16])
# ax = fig.add_subplot(2, 1, 1)
# ax.plot(array)
# ax = fig.add_subplot(2, 1, 2, sharex=ax)
# ax.plot(corr_res)
# plt.show()
# #print(R.size)
# # Nw = 20
# # result = R[Nw,Nw:-1]
# # result[0] = 0
# #ax.plot(result)
# #print(len(data))
# #print(len(R))
#
# print(self.window_size)
# print(self.threshold)
def predict(self, dataframe):
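# Cross-correlate the mean-centered series with a step-shaped pattern
# ([-1]*step_size + [1]*step_size), normalize by the correlation maximum seen
# during fit, threshold both polarities, merge overlapping segments and drop
# any that intersect the labeled training segments.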
array = dataframe['value'].values  # .as_matrix() was removed in newer pandas
norm_data = (array - self.mean)
step_size = self.window_size // 2
pattern = np.concatenate([[-1] * step_size, [1] * step_size])
corr_res = np.correlate(norm_data, pattern, mode='valid') / self.window_size
corr_res = np.concatenate((np.zeros(step_size), corr_res, np.zeros(step_size)))
corr_res /= self.corr_max
result = self.__predict(corr_res, self.threshold)
# import matplotlib.pyplot as plt
# fig, ax = plt.subplots(figsize=[18, 16])
# ax = fig.add_subplot(2, 1, 1)
# ax.plot(array[:70000])
# ax = fig.add_subplot(2, 1, 2, sharex=ax)
# ax.plot(corr_res[:70000])
# plt.show()
result.sort()
result = compress_segments(result)
if len(self.segments) > 0:
result = [segment for segment in result if not is_intersect(segment, self.segments)]
return result
def __optimize(self, data, segments, contamination):
window_size = 10
mincost = None
while window_size < 100:
# print(window_size)
cost = self.__optimize_threshold(data, window_size, segments, contamination)
if mincost is None or cost < mincost:
mincost = cost
self.window_size = window_size
window_size = int(window_size * 1.2)
self.__optimize_threshold(data, self.window_size, segments, contamination)
def __optimize_threshold(self, data, window_size, segments, contamination):
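# 20-step bisection on the threshold in (0, 1): with labeled segments, a
# threshold is "good" when every labeled segment is intersected and the cost is
# the number of detections; without labels, "good" means the detections cover
# more than `contamination` of the data. Good thresholds raise the lower bound,
# so the search converges on the highest threshold that is still good.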
step_size = window_size // 2
pattern = np.concatenate([[-1] * step_size, [1] * step_size])
corr_res = np.correlate(data, pattern, mode='same') / window_size
corr_res = np.concatenate((np.zeros(step_size), corr_res, np.zeros(step_size)))
self.corr_max = corr_res.max()
corr_res /= self.corr_max
N = 20
lower = 0.
upper = 1.
cost = 0
for i in range(0, N):
self.threshold = 0.5 * (lower + upper)
result = self.__predict(corr_res, self.threshold)
if len(segments) > 0:
intersections, labeled = calc_intersections(segments, result)
good = intersections == labeled
cost = len(result)
else:
total_sum = 0
for segment in result:
total_sum += (segment[1] - segment[0])
good = total_sum > len(data) * contamination
cost = -self.threshold
if good:
lower = self.threshold
else:
upper = self.threshold
return cost
def __predict(self, data, threshold):
segments = find_segments(data, threshold)
segments += find_segments(data * -1, threshold)
#segments -= 1
return [(x - 1, y - 1) for (x, y) in segments]
def save(self, model_filename):
with open(model_filename, 'wb') as file:
pickle.dump((self.mean, self.window_size, self.corr_max, self.threshold), file)
def load(self, model_filename):
try:
with open(model_filename, 'rb') as file:
self.mean, self.window_size, self.corr_max, self.threshold = pickle.load(file)
except Exception:
# a missing or unreadable model file leaves the detector untrained
pass

71
src/supervised_algorithm.py

@ -0,0 +1,71 @@
import pickle
from tsfresh.transformers.feature_selector import FeatureSelector
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import IsolationForest
import pandas as pd
from sklearn import svm
class supervised_algorithm(object):
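# Scales the selected features to [-1, 1] and fits an IsolationForest on the
# most recent 100000 rows; predict() returns a boolean Series that is True for
# rows the forest labels as outliers (raw prediction < 0).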
frame_size = 16
good_features = [
#"value__agg_linear_trend__f_agg_\"max\"__chunk_len_5__attr_\"intercept\"",
# "value__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_12__w_20",
# "value__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_13__w_5",
# "value__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_2__w_10",
# "value__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_2__w_20",
# "value__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_8__w_20",
# "value__fft_coefficient__coeff_3__attr_\"abs\"",
"time_of_day_column_x",
"time_of_day_column_y",
"value__abs_energy",
# "value__absolute_sum_of_changes",
# "value__sum_of_reoccurring_data_points",
]
clf = None
scaler = None
def __init__(self):
self.features = []
self.col_to_max, self.col_to_min, self.col_to_median = None, None, None
self.augmented_path = None
def fit(self, dataset, contamination=0.005):
dataset = dataset[self.good_features]
dataset = dataset[-100000:]
self.scaler = MinMaxScaler(feature_range=(-1, 1))
# self.clf = svm.OneClassSVM(nu=contamination, kernel="rbf", gamma=0.1)
self.clf = IsolationForest(contamination=contamination)
self.scaler.fit(dataset)
dataset = self.scaler.transform(dataset)
self.clf.fit(dataset)
def predict(self, dataframe):
dataset = dataframe[self.good_features]
dataset = self.scaler.transform(dataset)
prediction = self.clf.predict(dataset)
# for i in range(len(dataset)):
# print(str(dataset[i]) + " " + str(prediction[i]))
prediction = [x < 0.0 for x in prediction]
return pd.Series(prediction, index=dataframe.index)
def save(self, model_filename):
with open(model_filename, 'wb') as file:
pickle.dump((self.clf, self.scaler), file)
def load(self, model_filename):
with open(model_filename, 'rb') as file:
self.clf, self.scaler = pickle.load(file)
def __select_features(self, x, y):
# feature_selector = FeatureSelector()
feature_selector = FeatureSelector()
feature_selector.fit(x, y)
return feature_selector.relevant_features

131
src/worker.py

@ -0,0 +1,131 @@
from anomaly_model import AnomalyModel
from pattern_detection_model import PatternDetectionModel
import queue
import threading
import json
import logging
import sys
import traceback
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
filename='analytic_toolset.log',
filemode='a')
logger = logging.getLogger('analytic_toolset')
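# The worker consumes "learn"/"predict" tasks either from its internal queue
# (start()/add_task()) or, when run as a script, one JSON task per line from
# stdin; results and progress updates are written to stdout as JSON lines.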
class worker(object):
models_cache = {}
thread = None
queue = queue.Queue()
def start(self):
self.thread = threading.Thread(target=self.run)
self.thread.start()
def stop(self):
if self.thread:
self.queue.put(None)
self.thread.join()
def run(self):
while True:
task = self.queue.get()
# stop() enqueues None as a sentinel; treat it like an explicit "stop" task
if task is None or task['type'] == "stop":
break
self.do_task(task)
self.queue.task_done()
def add_task(self, task):
self.queue.put(task)
def do_task(self, task):
try:
type = task['type']
anomaly_id = task['anomaly_id']
if type == "predict":
last_prediction_time = task['last_prediction_time']
analytics_type = task['analytics_type']
preset = None
if "preset" in task:
preset = task['preset']
result = self.do_predict(anomaly_id, last_prediction_time, analytics_type, preset)
elif type == "learn":
segments = task['segments']
analytics_type = task['analytics_type']
preset = None
if "preset" in task:
preset = task['preset']
result = self.do_learn(anomaly_id, segments, analytics_type, preset)
else:
result = {
'status': "failed",
'error': "unknown type " + str(type)
}
except Exception as e:
#traceback.extract_stack()
error_text = traceback.format_exc()
logger.error("Exception: '%s'" % error_text)
result = {
'task': type,
'status': "failed",
'anomaly_id': anomaly_id,
'error': str(e)
}
return result
def do_learn(self, anomaly_id, segments, analytics_type, preset=None):
model = self.get_model(anomaly_id, analytics_type, preset)
model.synchronize_data()
last_prediction_time = model.learn(segments)
result = self.do_predict(anomaly_id, last_prediction_time, analytics_type, preset)
result['task'] = 'learn'
return result
def do_predict(self, anomaly_id, last_prediction_time, analytics_type, preset=None):
model = self.get_model(anomaly_id, analytics_type, preset)
model.synchronize_data()
segments, last_prediction_time = model.predict(last_prediction_time)
return {
'task': "predict",
'status': "success",
'anomaly_id': anomaly_id,
'segments': segments,
'last_prediction_time': last_prediction_time
}
def get_model(self, anomaly_id, analytics_type, preset=None):
if anomaly_id not in self.models_cache:
if analytics_type == "anomalies":
model = AnomalyModel(anomaly_id)
elif analytics_type == "patterns":
model = PatternDetectionModel(anomaly_id, preset)
self.models_cache[anomaly_id] = model
return self.models_cache[anomaly_id]
if __name__ == "__main__":
w = worker()
logger.info("Worker was started")
while True:
try:
text = input("")
task = json.loads(text)
logger.info("Received command '%s'" % text)
if task['type'] == "stop":
logger.info("Stopping...")
break
print(json.dumps({
'task': task['type'],
'anomaly_id': task['anomaly_id'],
'__task_id': task['__task_id'],
'status': "in progress"
}))
sys.stdout.flush()
res = w.do_task(task)
res['__task_id'] = task['__task_id']
print(json.dumps(res))
sys.stdout.flush()
except Exception as e:
logger.error("Exception: '%s'" % str(e))