You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
63 lines
2.1 KiB
63 lines
2.1 KiB
from itertools import chain |
|
import pandas as pd |
|
import numpy as np |
|
from typing import Generator |
|
|
|
def prepare_data(data: list) -> pd.DataFrame:
    """
    Build a two-column DataFrame from raw (timestamp, value) records.

    Steps performed:
     - wraps `data` into a pd.DataFrame with columns 'timestamp' and 'value',
     - parses the 'timestamp' column as epoch milliseconds into datetimes,
     - fills missing entries with np.nan.

    Returns the resulting DataFrame.
    """
    column_names = ['timestamp', 'value']
    frame = pd.DataFrame(data, columns=column_names)

    # Timestamps are interpreted as milliseconds since the Unix epoch.
    parsed_timestamps = pd.to_datetime(frame['timestamp'], unit='ms')
    frame['timestamp'] = parsed_timestamps

    # Fill missing entries with NaN, in place on the freshly built frame.
    frame.fillna(value=np.nan, inplace=True)
    return frame
|
|
|
def get_intersected_chunks(data: list, intersection: int, chunk_size: int) -> Generator[list, None, None]:
    """
    Return a generator that splits `data` into overlapping (intersected) chunks.

    Consecutive chunks share `intersection` elements, which makes it possible
    to detect a pattern that lies on the border between two chunks.

    data - sliceable sequence supporting len() and data[a:b]
    intersection - number of elements shared by two neighbouring chunks
    chunk_size - maximum length of a chunk

    If `data` is not longer than `chunk_size`, it is yielded as is (the same
    object, not a copy). Otherwise every chunk is a slice of `data`; all chunks
    except possibly the last one have exactly `chunk_size` elements.

    Raises AssertionError when `chunk_size` or `intersection` is not positive,
    or when splitting is required and `intersection` is not smaller than
    `chunk_size`. Since this is a generator, the checks run on first iteration.
    """
    assert chunk_size > 0, 'chunk size must be great than zero'
    assert intersection > 0, 'intersection length must be great than zero'

    data_len = len(data)

    # Everything fits into a single chunk: nothing to split.
    if data_len <= chunk_size:
        yield data
        return

    # Distance between the starts of two neighbouring chunks.
    step = chunk_size - intersection
    # A non-positive step would never advance the offset and the generator
    # would yield forever, so reject it explicitly. The check lives after the
    # single-chunk shortcut to keep calls that never needed a step working.
    assert step > 0, 'intersection length must be less than chunk size'

    offset = 0
    # Emit full-size chunks while more than one chunk worth of data remains.
    while data_len - offset > chunk_size:
        yield data[offset:offset + chunk_size]
        offset += step

    # The remainder (1..chunk_size elements) forms the last chunk.
    yield data[offset:data_len]
|
|
|
def get_chunks(data: list, chunk_size: int) -> Generator[list, None, None]:
    """
    Return a generator that splits `data` into consecutive, non-overlapping chunks.

    data - sequence to split (must support iteration, len() and slicing)
    chunk_size - length of a chunk

    Every chunk is yielded as a new list. All chunks have `chunk_size`
    elements, except the last one, which holds the remainder when the length
    of `data` is not a multiple of `chunk_size`.
    """
    assert chunk_size > 0, 'chunk size must be great than zero'

    # The same iterator repeated `chunk_size` times: zip pulls `chunk_size`
    # consecutive elements per tuple, producing only the complete chunks.
    shared_iterator = iter(data)
    full_chunks = zip(*[shared_iterator] * chunk_size)

    # zip drops an incomplete trailing group, so take the leftover elements
    # straight from the end of `data`.
    remainder = len(data) % chunk_size
    tail = [data[-remainder:]] if remainder != 0 else []

    for piece in chain(full_chunks, tail):
        yield list(piece)
|
|
|