Hastic standalone
https://hastic.io
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
467 lines
13 KiB
467 lines
13 KiB
use std::{collections::VecDeque, fmt, sync::Arc}; |
|
|
|
use futures::future; |
|
use parking_lot::Mutex; |
|
|
|
use gbdt::config::Config; |
|
use gbdt::decision_tree::Data; |
|
use gbdt::gradient_boost::GBDT; |
|
|
|
use crate::services::{ |
|
analytic_service::types::{LearningTrain, HSR}, |
|
metric_service::MetricService, |
|
segments_service::{Segment, SegmentType, SegmentsService}, |
|
}; |
|
|
|
use super::types::{AnalyticUnit, AnalyticUnitConfig, LearningResult, PatternConfig}; |
|
|
|
use async_trait::async_trait; |
|
|
|
use rustfft::{self, num_complex::Complex, FftPlanner}; |
|
|
|
// TODO: move to config
// Step passed to MetricService::query — presumably the sampling period in
// seconds; TODO confirm against the metric service implementation.
const DETECTION_STEP: u64 = 10;
// Size of the FFT used for spectral features in `get_features`.
const FFT_LEN: usize = 64;

// TODO: convert to vector
// Feature layout: min, max, mean, sd (4) + 16 complex FFT bins (re/im) = 36.
pub const FEATURES_SIZE: usize = 4 + 16 * 2;
pub type Features = Vec<f64>;

use std::f64::consts;
|
|
|
/// Everything `PatternAnalyticUnit::learn` produces and `detect` consumes.
#[derive(Clone)]
pub struct LearningResults {
    // TODO: replace with RWLock
    // Trained gradient-boosting classifier; wrapped in a Mutex because
    // `detect` only holds `&self` but `predict` needs exclusive access here.
    model: Arc<Mutex<GBDT>>,

    // Raw feature/target pairs kept after training (e.g. for re-training/export).
    pub learning_train: LearningTrain,

    // Labeled series: positive examples...
    patterns: Vec<Vec<f64>>,
    // ...and negative examples (anti-patterns).
    anti_patterns: Vec<Vec<f64>>,

    // Mean point-count over all labeled series; used as the sliding-window size.
    avg_pattern_length: usize,
}
|
|
|
impl fmt::Debug for LearningResults { |
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
|
f.debug_struct("Point") |
|
.field("{:?}", &self.patterns) |
|
.field("{:?}", &self.anti_patterns) |
|
.finish() |
|
} |
|
} |
|
|
|
// TODO: maybe remove this because metric service works with interpolate_nans_and_gaps_with_zeros
/// Maps NaN to 0.0; every other value (including infinities) passes through.
fn nan_to_zero(n: f64) -> f64 {
    if n.is_nan() {
        0.
    } else {
        n
    }
}
|
|
|
/// A segment's time serie paired with its label.
struct SegData {
    // true when the segment was labeled as a positive example (SegmentType::Label)
    label: bool,
    // (timestamp, value) points covering the segment's [from, to] range
    data: Vec<(u64, f64)>,
}
|
|
|
async fn segment_to_segdata(ms: &MetricService, segment: &Segment) -> anyhow::Result<SegData> { |
|
let mut mr = ms.query(segment.from, segment.to, DETECTION_STEP).await?; |
|
|
|
if mr.data.keys().len() == 0 { |
|
return Ok(SegData { |
|
label: segment.segment_type == SegmentType::Label, |
|
data: Default::default(), |
|
}); |
|
} |
|
|
|
// TODO: unwrap -> ? |
|
let k = mr.data.keys().nth(0).unwrap().clone(); |
|
let ts = mr.data.remove(&k).unwrap(); |
|
|
|
Ok(SegData { |
|
label: segment.segment_type == SegmentType::Label, |
|
data: ts, |
|
}) |
|
} |
|
|
|
fn get_features(xs: &Vec<f64>) -> Features { |
|
let mut min = f64::MAX; |
|
let mut max = f64::MIN; |
|
let mut sum = 0f64; |
|
|
|
for x in xs { |
|
min = min.min(*x); |
|
max = max.max(*x); |
|
sum += x; |
|
} |
|
|
|
let mean = sum / xs.len() as f64; |
|
|
|
sum = 0f64; |
|
|
|
for x in xs { |
|
sum += (x - mean) * (x - mean); |
|
} |
|
|
|
let sd = sum.sqrt(); |
|
|
|
// TODO: add DWT |
|
|
|
let mut planner = FftPlanner::<f64>::new(); |
|
|
|
let fft = planner.plan_fft_forward(FFT_LEN); |
|
let mut c_buffer = vec![ |
|
Complex { |
|
re: 0.0f64, |
|
im: 0.0f64 |
|
}; |
|
FFT_LEN |
|
]; |
|
|
|
let p = 1.0 / FFT_LEN as f64; |
|
for i in 0..FFT_LEN.min(xs.len()) { |
|
c_buffer[i].re = xs[i] * consts::E.powf((i as f64) * p); |
|
} |
|
|
|
fft.process(&mut c_buffer); |
|
|
|
// https://docs.rs/rustfft/6.0.1/rustfft/index.html#normalization |
|
let norm_factor = (FFT_LEN.min(xs.len()) as f64).sqrt(); |
|
for i in 0..FFT_LEN.min(xs.len()) { |
|
c_buffer[i] /= norm_factor; |
|
} |
|
|
|
let mut fs_result = Vec::<f64>::new(); |
|
fs_result.push(min); |
|
fs_result.push(max); |
|
fs_result.push(mean); |
|
fs_result.push(sd); |
|
|
|
for i in 0..16usize { |
|
fs_result.push(c_buffer[i].re); |
|
fs_result.push(c_buffer[i].im); |
|
} |
|
|
|
return fs_result; |
|
} |
|
|
|
/// Pearson correlation coefficient over the overlapping prefix of `xs` and `ys`.
///
/// Only the first `min(xs.len(), ys.len())` points of each serie participate.
/// Returns 0.0 when the correlation is numerically undefined (a near-constant
/// serie drives the denominator to ~0).
fn corr_aligned(xs: &VecDeque<f64>, ys: &Vec<f64>) -> f32 {
    let min = xs.len().min(ys.len());
    // BUGFIX: n must be the number of points actually summed over (the overlap
    // length), not xs.len(); the two differ whenever ys is shorter than xs,
    // which skewed the correlation.
    let n = min as f64;

    let mut s_xs: f64 = 0f64;
    let mut s_ys: f64 = 0f64;
    let mut s_xsys: f64 = 0f64;
    let mut s_xs_2: f64 = 0f64;
    let mut s_ys_2: f64 = 0f64;

    xs.iter()
        .take(min)
        .zip(ys.iter().take(min))
        .for_each(|(xi, yi)| {
            s_xs += xi;
            s_ys += yi;
            s_xsys += xi * yi;
            s_xs_2 += xi * xi;
            s_ys_2 += yi * yi;
        });

    let numerator: f64 = n * s_xsys - s_xs * s_ys;
    let denominator: f64 = ((n * s_xs_2 - s_xs * s_xs) * (n * s_ys_2 - s_ys * s_ys)).sqrt();

    // TODO: it's a hack, check and make it better
    // Near-zero denominator means at least one serie is (almost) constant and
    // the correlation is undefined; treat it as "no correlation".
    if denominator < 0.01 {
        return 0.;
    }

    let result: f64 = numerator / denominator;

    // |corr| noticeably above 1 signals a numeric problem — dump the inputs.
    if result.abs() > 1.1 {
        println!("{:?}", xs);
        println!("------------");
        println!("{:?}", ys);
        println!("WARNING: corr result > 1: {}", result);
    }

    return result as f32; // we know that it's in -1..1
}
|
|
|
fn max_corr_with_segments(xs: &VecDeque<f64>, yss: &Vec<Vec<f64>>) -> f32 { |
|
let mut max_corr = 0.0; // we just take positive part of correlation |
|
for ys in yss.iter() { |
|
let c = corr_aligned(xs, ys); |
|
// TODO: check that here no NaNs |
|
if c > max_corr { |
|
max_corr = c; |
|
} |
|
} |
|
return max_corr; |
|
} |
|
|
|
/// Analytic unit that finds occurrences of user-labeled patterns by combining
/// correlation against stored examples with a GBDT vote over spectral features.
pub struct PatternAnalyticUnit {
    id: String,
    config: PatternConfig,
    // None until `learn` has completed successfully
    learning_results: Option<LearningResults>,
}
|
|
|
// TODO: move this to loginc of analytic unit |
|
impl PatternAnalyticUnit { |
|
pub fn new(id: String, cfg: PatternConfig) -> PatternAnalyticUnit { |
|
PatternAnalyticUnit { |
|
id, |
|
config: cfg, |
|
learning_results: None, |
|
} |
|
} |
|
} |
|
|
|
#[async_trait] |
|
impl AnalyticUnit for PatternAnalyticUnit { |
|
fn get_id(&self) -> String { |
|
return self.id.to_owned(); |
|
} |
|
fn get_detection_window(&self) -> u64 { |
|
let lr = self.learning_results.as_ref().unwrap(); |
|
return lr.avg_pattern_length as u64; |
|
} |
|
fn set_config(&mut self, config: AnalyticUnitConfig) { |
|
if let AnalyticUnitConfig::Pattern(cfg) = config { |
|
self.config = cfg; |
|
} else { |
|
panic!("Bad config!"); |
|
} |
|
} |
|
|
|
async fn learn( |
|
&mut self, |
|
ms: MetricService, |
|
ss: SegmentsService, |
|
) -> anyhow::Result<LearningResult> { |
|
// TODO: move to config |
|
let mut cfg = Config::new(); |
|
cfg.set_feature_size(FEATURES_SIZE); |
|
cfg.set_max_depth(3); |
|
cfg.set_iterations(50); |
|
cfg.set_shrinkage(0.1); |
|
cfg.set_loss("LogLikelyhood"); |
|
cfg.set_debug(false); |
|
cfg.set_data_sample_ratio(1.0); |
|
cfg.set_feature_sample_ratio(1.0); |
|
cfg.set_training_optimization_level(2); |
|
|
|
// be careful if decide to store detections in db |
|
let segments = ss.get_segments_inside(0, u64::MAX / 2)?; |
|
let has_segments_label = segments |
|
.iter() |
|
.find(|s| s.segment_type == SegmentType::Label) |
|
.is_some(); |
|
|
|
if !has_segments_label { |
|
return Ok(LearningResult::FinishedEmpty); |
|
} |
|
|
|
let fs = segments.iter().map(|s| segment_to_segdata(&ms, s)); |
|
let rs = future::join_all(fs).await; |
|
|
|
let mut learn_tss = Vec::new(); |
|
let mut learn_anti_tss = Vec::new(); |
|
|
|
for r in rs { |
|
if r.is_err() { |
|
// TODO: custom DatasourceError error type |
|
return Err(anyhow::format_err!( |
|
"Error extracting metrics from datasource" |
|
)); |
|
} |
|
|
|
let sd = r?; |
|
if sd.data.is_empty() { |
|
continue; |
|
} |
|
if sd.label { |
|
learn_tss.push(sd.data); |
|
} else { |
|
learn_anti_tss.push(sd.data); |
|
} |
|
} |
|
|
|
if learn_tss.len() == 0 { |
|
return Ok(LearningResult::FinishedEmpty); |
|
} |
|
|
|
let mut patterns = Vec::<Vec<f64>>::new(); |
|
let mut anti_patterns = Vec::<Vec<f64>>::new(); |
|
|
|
let mut records_raw = Vec::<Features>::new(); |
|
let mut targets_raw = Vec::<bool>::new(); |
|
|
|
let mut pattern_length_size_sum = 0usize; |
|
|
|
for r in learn_tss { |
|
let xs: Vec<f64> = r.iter().map(|e| e.1).map(nan_to_zero).collect(); |
|
let fs = get_features(&xs); |
|
|
|
records_raw.push(fs); |
|
targets_raw.push(true); |
|
pattern_length_size_sum += xs.len(); |
|
patterns.push(xs); |
|
} |
|
|
|
for r in learn_anti_tss { |
|
let xs: Vec<f64> = r.iter().map(|e| e.1).map(nan_to_zero).collect(); |
|
let fs = get_features(&xs); |
|
records_raw.push(fs); |
|
targets_raw.push(false); |
|
pattern_length_size_sum += xs.len(); |
|
anti_patterns.push(xs); |
|
} |
|
|
|
let mut train_dv = Vec::new(); |
|
assert_eq!(records_raw.len(), targets_raw.len()); |
|
|
|
for i in 0..records_raw.len() { |
|
let data = Data::new_training_data( |
|
records_raw[i].iter().map(|e| *e as f32).collect(), |
|
1.0, |
|
if targets_raw[i] { 1.0 } else { -1.0 }, |
|
Some(0.5), |
|
); |
|
// println!("{:?}", targets_raw[i]); |
|
train_dv.push(data); |
|
} |
|
|
|
let mut model = GBDT::new(&cfg); |
|
model.fit(&mut train_dv); |
|
|
|
let avg_pattern_length = pattern_length_size_sum / (&patterns.len() + &anti_patterns.len()); |
|
|
|
self.learning_results = Some(LearningResults { |
|
model: Arc::new(Mutex::new(model)), |
|
|
|
learning_train: LearningTrain { |
|
features: records_raw, |
|
target: targets_raw, |
|
}, |
|
|
|
patterns, |
|
anti_patterns, |
|
|
|
avg_pattern_length, |
|
}); |
|
|
|
return Ok(LearningResult::Finished); |
|
} |
|
|
|
// TODO: get iterator instead of vector |
|
async fn detect( |
|
&self, |
|
ms: MetricService, |
|
from: u64, |
|
to: u64, |
|
) -> anyhow::Result<Vec<(u64, u64)>> { |
|
if self.learning_results.is_none() { |
|
return Err(anyhow::format_err!("Learning results are not ready")); |
|
} |
|
|
|
let mr = ms.query(from, to, DETECTION_STEP).await.unwrap(); |
|
|
|
if mr.data.keys().len() == 0 { |
|
return Ok(Vec::new()); |
|
} |
|
|
|
let k = mr.data.keys().nth(0).unwrap(); |
|
let ts = &mr.data[k]; |
|
|
|
let lr = self.learning_results.as_ref().unwrap(); |
|
let mut results = Vec::new(); |
|
|
|
let pt = &lr.patterns; |
|
let apt = &lr.anti_patterns; |
|
|
|
if lr.avg_pattern_length > ts.len() { |
|
// TODO: handle case when we inside pattern |
|
return Ok(results); |
|
} |
|
|
|
let mut window = VecDeque::<f64>::new(); |
|
for i in 0..lr.avg_pattern_length { |
|
window.push_back(nan_to_zero(ts[i].1)); |
|
} |
|
|
|
let mut i = lr.avg_pattern_length - 1; |
|
|
|
let mut from: Option<u64> = None; |
|
let mut to: Option<u64> = None; |
|
|
|
loop { |
|
let positive_corr = max_corr_with_segments(&window, pt); |
|
let negative_corr = max_corr_with_segments(&window, apt); |
|
|
|
let model_weight = { |
|
let mut vs: Vec<f64> = Vec::new(); |
|
for v in window.iter() { |
|
vs.push(*v); |
|
} |
|
let fs = get_features(&vs).iter().map(|e| *e as f32).collect(); |
|
let lk = lr.model.lock(); |
|
|
|
let data_t1 = Data::new_test_data(fs, Some(0.5)); |
|
let mut test_dv = Vec::new(); |
|
test_dv.push(data_t1); |
|
lk.predict(&test_dv)[0] |
|
}; |
|
|
|
let score = positive_corr * self.config.correlation_score |
|
- negative_corr * self.config.anti_correlation_score |
|
+ model_weight as f32 * self.config.model_score; |
|
|
|
// TODO: replace it with score > config.score_treshold |
|
if score > self.config.threshold_score { |
|
// inside pattern |
|
if from.is_none() { |
|
from = Some(ts[i - (lr.avg_pattern_length - 1)].0); |
|
} |
|
to = Some(ts[i].0); |
|
} else { |
|
if to.is_some() { |
|
// merge with last |
|
if results.len() > 0 && results.last().unwrap().1 >= from.unwrap() { |
|
let (prev_from, _) = results.pop().unwrap(); |
|
results.push((prev_from, to.unwrap())); |
|
} else { |
|
results.push((from.unwrap(), to.unwrap())); |
|
} |
|
from = None; |
|
to = None; |
|
} |
|
} |
|
|
|
i += 1; |
|
if i == ts.len() { |
|
break; |
|
} |
|
|
|
window.pop_front(); |
|
window.push_back(ts[i].1); |
|
} |
|
|
|
if to.is_some() { |
|
results.push((from.unwrap(), to.unwrap())); |
|
} |
|
|
|
Ok(results) |
|
} |
|
|
|
// TODO: use hsr for learning and detections |
|
async fn get_hsr(&self, ms: MetricService, from: u64, to: u64) -> anyhow::Result<HSR> { |
|
let mr = ms.query(from, to, DETECTION_STEP).await.unwrap(); |
|
|
|
if mr.data.keys().len() == 0 { |
|
return Ok(HSR::TimeSerie(Vec::new())); |
|
} |
|
|
|
let k = mr.data.keys().nth(0).unwrap(); |
|
let ts = mr.data[k].clone(); |
|
|
|
Ok(HSR::TimeSerie(ts)) |
|
} |
|
}
|
|
|