Source code for tsad.base.datasets

import pandas as pd

from dataclasses import dataclass


[docs]@dataclass class Dataset(): name: str description: str task: str frame: pd.DataFrame | list[pd.DataFrame] | list[list[pd.DataFrame]] target: pd.DataFrame | list[pd.DataFrame] | list[list[pd.DataFrame]] feature_names: list target_names: list
[docs]def list_of_datasets(): ''' Shows the list of available for import datasets. Returns ------- list_of_datasets : dict ''' list_of_datasets = {'Combines state monitoring':'load_combines()', 'SKAB (skoltech anomaly benchmark) teaser':'load_skab_teaser()', 'SKAB (skoltech anomaly benchmark)':'load_skab()', 'NASA Turbofan Jet Engine Data Set':'load_turbofan_jet_engine()', 'TEP (Tennessee Eastman process)':'load_tep()', 'Pressurized Water Reactor (PWR) Dataset for Fault Detection':'load_pwr_anomalies()', 'NPP Power Transformer RUL':'load_transformer_rul()', 'Exhauster Fault Detection dataset':'load_exhauster_faults()'} return list_of_datasets
[docs]def load_combines() -> Dataset: ''' Loads and slightly preprocesses raw data of Combines dataset. Returns ------- list_of_datasets : list References ---------- L-BFGS-B -- Software for Large-scale Bound-constrained Optimization Ciyou Zhu, Richard Byrd, Jorge Nocedal and Jose Luis Morales. http://users.iems.northwestern.edu/~nocedal/lbfgsb.html ''' url = 'https://www.dropbox.com/scl/fi/4dqcr9sdyc6z91925e0yq/data.xls?dl=1&rlkey=1rlgka6ngn7lpja8869flz1m1' frame = pd.read_excel(url, skiprows=2)\ .pivot_table(values='Значение', index='Время', columns='Описание') name = 'Combines state monitoring' description = '' task = '' target_names=None return Dataset(name=name, description=description, task=task, frame=frame, target=None, feature_names=list(frame.columns), target_names=target_names)
[docs]def load_skab_teaser() -> Dataset: ''' Loads and slightly preprocesses raw data of SKAB (skoltech anomaly benchmark) teaser. Returns ------- Dataset A dataset object with the following structure: name : str description : str task : str frame: list[pd.DataFrame] feature_names : list target_names : list References ---------- SKAB - Skoltech Anomaly Benchmark | teaser Iurii Katser and Viacheslav Kozitsin. https://www.kaggle.com/datasets/yuriykatser/skoltech-anomaly-benchmark-skab-teaser ''' # X url='https://drive.google.com/file/d/1Gtz3LJLxoyHLatV_d07Pny5wKHGGsbj3/view?usp=share_link' url='https://drive.google.com/uc?id=' + url.split('/')[-2] frame = pd.read_csv(url, sep=';', parse_dates=['datetime'])\ .pivot_table(values='value', index='datetime', columns='id') # y_test y_test = [('2019-07-08 18:39:22', '2019-07-08 18:42:32'), ('2019-07-08 18:44:36', '2019-07-08 18:46:51'), ('2019-07-08 19:06:57', '2019-07-08 19:11:31'), ('2019-07-08 19:14:40', '2019-07-08 19:21:16')] name = 'SKAB (skoltech anomaly benchmark) teaser' description = 'Dataset for process monitoring (changepoint detection) benchmarking. It is just a short version (teaser) of SKAB' task = 'Process monitoring (changepoint detection)' target_names=None return Dataset(name=name, description=description, task=task, frame=[frame, y_test], target=None, feature_names=list(frame.columns), target_names=target_names)
[docs]def load_skab() -> Dataset: ''' Loads and slightly preprocesses raw data of SKAB (skoltech anomaly benchmark). Returns ------- Dataset A dataset object with the following structure: name : str description : str task : str frame: pd.DataFrame feature_names : list target_names : list References ---------- Skoltech anomaly benchmark (skab). Katser, Iurii D., and Vyacheslav O. Kozitsin. Kaggle (2020). https://www.kaggle.com/dsv/1693952 ''' url='https://drive.google.com/file/d/1_aeGB3M3CNSEqYPxHPuGK0ju6hMuJUoK/view?usp=share_link' url='https://drive.google.com/uc?id=' + url.split('/')[-2] frame = pd.read_csv(url, sep=',', parse_dates=['datetime']) frame.set_index(['experiment', 'datetime'], inplace=True) name = 'SKAB (skoltech anomaly benchmark)' description = 'Dataset for process monitoring (changepoint detection) benchmarking' task = 'Process monitoring (changepoint detection)' feature_names = ['Accelerometer1RMS', 'Accelerometer2RMS', 'Current', 'Pressure', 'Temperature', 'Thermocouple', 'Voltage', 'Volume Flow RateRMS'] target_names = ['anomaly', 'changepoint'] return Dataset(name=name, description=description, task=task, frame=frame, target=None, feature_names=feature_names, target_names=target_names)
[docs]def load_turbofan_jet_engine() -> Dataset: ''' Loads and slightly preprocesses raw data of NASA Turbofan Jet Engine Data Set. Returns ------- Dataset A dataset object with the following structure: name : str description : str task : str frame: list[pd.DataFrame] feature_names : list target_names : list References ---------- Damage Propagation Modeling for Aircraft Engine Run-to-Failure Simulation A. Saxena, K. Goebel, D. Simon, and N. Eklund. in the Proceedings of the 1st International Conference on Prognostics and Health Management (PHM08), Denver CO, Oct 2008. https://www.kaggle.com/datasets/behrad3d/nasa-cmaps ''' feature_names = ['id','cycle','setting1','setting2','setting3','s1','s2','s3','s4','s5','s6','s7','s8','s9','s10','s11','s12','s13','s14','s15','s16','s17','s18','s19','s20','s21'] target_names = ['ttf'] # X_train and y_train url = 'http://azuremlsamples.azureml.net/templatedata/PM_train.txt' frame_train = pd.read_csv(url, sep = ' ', header=None) frame_train.drop([26,27], axis=1, inplace=True) frame_train.columns = feature_names # X_test url = 'http://azuremlsamples.azureml.net/templatedata/PM_test.txt' frame_test = pd.read_csv(url, sep = ' ', header=None) frame_test.drop([26,27], axis=1, inplace=True) frame_test.columns = feature_names # y_test url = 'http://azuremlsamples.azureml.net/templatedata/PM_truth.txt' y_test = pd.read_csv(url, sep = ' ', header=None) y_test.drop([1], axis=1, inplace=True) y_test.columns = target_names name = 'NASA Turbofan Jet Engine Data Set' description = '''Dataset includes Run-to-Failure simulated data from turbo fan jet engines. In this dataset the goal is to predict the remaining useful life (RUL) of each engine in the test dataset. RUL is equivalent of number of flights remained for the engine after the last datapoint in the test dataset. - In train dataset there are 100 engines. The last cycle for each engine represents the cycle when failure had happened. - In test dataset there are 100 engines as well. But this time, failure cycle was not provided.''' task = 'Remaining useful life prediction' return Dataset(name=name, description=description, task=task, frame=[frame_train, frame_test, y_test], target=None, feature_names=feature_names, target_names=target_names)
[docs]def load_tep() -> Dataset: ''' Loads and slightly preprocesses raw data of TEP (Tennessee Eastman process) dataset. Returns ------- Dataset A dataset object with the following structure: name : str description : str task : str frame: pd.DataFrame feature_names : list target_names : list References ---------- Damage Propagation Modeling for Aircraft Engine Run-to-Failure Simulation Professor Richard Braatz. Large Scale Systems Research Laboratory. https://github.com/YKatser/CPDE/tree/master/TEP_data ''' url='https://drive.google.com/file/d/1zQq2TDKv0fBvXrDwkr9S08k3a3RNPDHO/view?usp=sharing' url='https://drive.google.com/uc?id=' + url.split('/')[-2] frame = pd.read_csv(url, sep=',').rename(columns={'Unnamed: 0':'index'}) frame.set_index(['experiment', 'index'], inplace=True) name = 'TEP (Tennessee Eastman process)' description = 'Each training data file contains 480 rows and 52 columns and each testing data file contains 960 rows and 52 columns. An observation vector at a particular time instant is given by x=[XMEAS(1), XMEAS(2), ..., XMEAS(41), XMV(1), ..., XMV(11)]^T where XMEAS(n) is the n-th measured variable and XMV(n) is the n-th manipulated variable.' task = 'Outlier detection' target_names=None return Dataset(name=name, description=description, task=task, frame=frame, target=None, feature_names=list(frame.columns), target_names=target_names)
[docs]def load_pwr_anomalies() -> Dataset: ''' Loads and slightly preprocesses raw data of Pressurized Water Reactor (PWR) Dataset. Returns ------- Dataset A dataset object with the following structure: name : str description : str task : str frame: pd.DataFrame feature_names : list target_names : list References ---------- Pressurized Water Reactor (PWR) Dataset for Fault Detection ENGR. MUSHFIQUR RASHID KHAN https://www.kaggle.com/datasets/prottoymushfiq/pressurized-water-reactor-abnormality-dataset ''' url='https://drive.google.com/file/d/1JjPzjqU9QWoFvrlEizoJvqTT6n0OVgNN/view?usp=share_link' url='https://drive.google.com/uc?id=' + url.split('/')[-2] frame = pd.read_csv(url, sep=',', index_col='Readings') name = 'Pressurized Water Reactor (PWR) Dataset for Fault Detection' description = 'Our collected dataset is benchmark data in case of reactor abnormalities detection with labels. There are 267 readings from 14 sensors of three categories: a temperature sensor, pressure sensor, and vibration sensor (including ionization chamber, accelerometer, and relative displacement sensors). This particular dataset can be utilized in the case of unsupervised abnormality detection.' task = 'Anomaly detection' target_names=None return Dataset(name=name, description=description, task=task, frame=frame, target=None, feature_names=list(frame.columns), target_names=target_names)
[docs]def load_transformer_rul() -> Dataset: ''' Loads and slightly preprocesses raw data of NPP Power Transformer. Returns ------- Dataset A dataset object with the following structure: name : str description : str task : str frame: list[pd.DataFrame] feature_names : list target_names : list References ---------- Machine Learning Methods for Anomaly Detection in Nuclear Power Plant Power Transformers. Katser, Iurii, et al. arXiv preprint arXiv:2211.11013 (2022). ''' url='https://drive.google.com/file/d/1_aeGB3M3CNSEqYPxHPuGK0ju6hMuJUoK/view?usp=share_link' url='https://drive.google.com/uc?id=' + url.split('/')[-2] frame = pd.read_csv(url, sep=',', parse_dates=['datetime']) frame.set_index(['experiment', 'datetime'], inplace=True) # X_train url='https://drive.google.com/file/d/1NSbmnIGE5foofxOCd-tQIlbnhAjSjvZX/view?usp=share_link' url='https://drive.google.com/uc?id=' + url.split('/')[-2] X_train = pd.read_csv(url, sep=',').rename(columns={'Unnamed: 0':'id', 'Unnamed: 1':'time point'}) X_train.set_index(['id', 'time point'], inplace=True) # X_test url='https://drive.google.com/file/d/1cb7uxJ3wmAZsGyzK1ZhW_S_sUU_JGqjJ/view?usp=share_link' url='https://drive.google.com/uc?id=' + url.split('/')[-2] X_test = pd.read_csv(url, sep=',').rename(columns={'Unnamed: 0':'id', 'Unnamed: 1':'time point'}) X_test.set_index(['id', 'time point'], inplace=True) # y_train url='https://drive.google.com/file/d/17akYhUR6R2qhc2PU8OCm9Alrp4sKxitC/view?usp=share_link' url='https://drive.google.com/uc?id=' + url.split('/')[-2] y_train = pd.read_csv(url, sep=',', index_col='id') # y_test url='https://drive.google.com/file/d/1-NUEm1yiAEdr42JXBbIwXx0tauWyGVvA/view?usp=share_link' url='https://drive.google.com/uc?id=' + url.split('/')[-2] y_test = pd.read_csv(url, sep=',', index_col='id') name = 'NPP Power Transformer RUL' description = '''Dataset for Determining the Remaining Useful Life of Transformers. It is necessary to create a mathematical model that will determine RUL by the final 420 points. The period between time points is 12 hours.''' task = 'Remaining useful life prediction' feature_names = ['H2', 'CO', 'C2H4', 'C2H2'] target_names = ['predicted'] return Dataset(name=name, description=description, task=task, frame=[X_train, X_test, y_train, y_test], target=None, feature_names=feature_names, target_names=target_names)
[docs]def load_exhauster_faults(equipment_number=1) -> Dataset: ''' Loads and slightly preprocesses raw data of Exhauster data. Telemetry Time Series Dataset for Fault Detection of Exhauster sintering machines. Parameters: ----------- equipment_number : int Number of equipment (dataset) to load data for. Possible values are {1, 2, 3, 4, 5, 6} Returns ------- Dataset A dataset object with the following structure: name : str description : str task : str frame: pd.DataFrame target: pd.DataFrame feature_names : list target_names : list References ---------- ''' get_link = 'https://getfile.dokpub.com/yandex/get/' links = { 1 : 'https://disk.yandex.ru/d/NUPXNgancdY-kw', 2 : 'https://disk.yandex.ru/d/16moKa1JyR5UOg', 3 : 'https://disk.yandex.ru/d/_pIk_m2DVdqYXw', 4 : 'https://disk.yandex.ru/d/KKnuyNbDMf2kyA', 5 : 'https://disk.yandex.ru/d/LWsbxXmJFC0hbg', 6 : 'https://disk.yandex.ru/d/IOerYX7eQuyB8Q', } url = get_link + links[equipment_number] frame = pd.read_parquet(url) target_names = ['anomaly', 'anomaly_category'] target = frame[target_names] frame = frame.drop(columns=target_names) name = 'Exhauster Fault Detection dataset' description = 'Telemetry Time Series Dataset for Fault Detection of Exhauster sintering machines.' task = 'Process monitoring (changepoint detection)' feature_names = ['Rotor Current 1', 'Rotor Current 2', 'Stator Current', 'Oil Pressure in System', 'Bearing Temperature on Support 1', 'Bearing Temperature on Support 2', 'Bearing Temperature on Support 3', 'Bearing Temperature on Support 4', 'Oil Temperature in System', 'Oil Temperature in Oil Block', 'Vibration on Support 1', 'Vibration on Support 2', 'Vibration on Support 3', 'Vibration on Support 3. Longitudinal.', 'Vibration on Support 4', 'Vibration on Support 4. Longitudinal.', 'anomaly', 'anomaly_category'] return Dataset(name=name, description=description, task=task, frame=frame, feature_names=feature_names, target=target, target_names=target_names)