|
|
|
from itertools import chain
|
|
|
|
import pandas as pd
|
|
|
|
import numpy as np
|
|
|
|
from typing import Generator
|
|
|
|
|
|
|
|
def prepare_data(data: list) -> pd.DataFrame:
|
|
|
|
"""
|
|
|
|
Takes list
|
|
|
|
- converts it into pd.DataFrame,
|
|
|
|
- converts 'timestamp' column to pd.Datetime,
|
|
|
|
- subtracts min value from the dataset
|
|
|
|
"""
|
|
|
|
data = pd.DataFrame(data, columns=['timestamp', 'value'])
|
|
|
|
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='ms')
|
|
|
|
data.fillna(value = np.nan, inplace = True)
|
|
|
|
return data
|
|
|
|
|
|
|
|
def get_intersected_chunks(data: list, intersection: int, chunk_size: int) -> Generator[list, None, None]:
|
|
|
|
"""
|
|
|
|
Returns generator that splits dataframe on intersected segments.
|
|
|
|
Intersection makes it able to detect pattern that present in dataframe on the border between chunks.
|
|
|
|
intersection - length of intersection.
|
|
|
|
chunk_size - length of chunk
|
|
|
|
"""
|
|
|
|
assert chunk_size > 0, 'chunk size must be great than zero'
|
|
|
|
assert intersection > 0, 'intersection length must be great than zero'
|
|
|
|
|
|
|
|
data_len = len(data)
|
|
|
|
|
|
|
|
if data_len <= chunk_size:
|
|
|
|
yield data
|
|
|
|
return
|
|
|
|
|
|
|
|
nonintersected = chunk_size - intersection
|
|
|
|
|
|
|
|
offset = 0
|
|
|
|
while True:
|
|
|
|
left_values = data_len - offset
|
|
|
|
if left_values == 0:
|
|
|
|
break
|
|
|
|
if left_values <= chunk_size:
|
|
|
|
yield data[offset : data_len]
|
|
|
|
break
|
|
|
|
else:
|
|
|
|
yield data[offset: offset + chunk_size]
|
|
|
|
offset += min(nonintersected, left_values)
|
|
|
|
|
|
|
|
def get_chunks(data: list, chunk_size: int) -> Generator[list, None, None]:
|
|
|
|
"""
|
|
|
|
Returns generator that splits dataframe on non-intersected segments.
|
|
|
|
chunk_size - length of chunk
|
|
|
|
"""
|
|
|
|
assert chunk_size > 0, 'chunk size must be great than zero'
|
|
|
|
|
|
|
|
chunks_iterables = [iter(data)] * chunk_size
|
|
|
|
result_chunks = zip(*chunks_iterables)
|
|
|
|
partial_chunk_len = len(data) % chunk_size
|
|
|
|
|
|
|
|
if partial_chunk_len != 0:
|
|
|
|
result_chunks = chain(result_chunks, [data[-partial_chunk_len:]])
|
|
|
|
|
|
|
|
for chunk in result_chunks:
|
|
|
|
yield list(chunk)
|