import pandas as pd
from dataclasses import dataclass
[docs]@dataclass
class Dataset():
name: str
description: str
task: str
frame: pd.DataFrame | list[pd.DataFrame] | list[list[pd.DataFrame]]
target: pd.DataFrame | list[pd.DataFrame] | list[list[pd.DataFrame]]
feature_names: list
target_names: list
def list_of_datasets():
    '''
    Return the catalogue of datasets that can be imported.

    Returns
    -------
    dict
        Maps each dataset title to the text of the loader call
        (e.g. ``'load_skab()'``) that imports it. The values are plain
        strings, not callables.
    '''
    # (title, loader call) pairs, kept in the order they are presented.
    catalogue = (
        ('Combines state monitoring', 'load_combines()'),
        ('SKAB (skoltech anomaly benchmark) teaser', 'load_skab_teaser()'),
        ('SKAB (skoltech anomaly benchmark)', 'load_skab()'),
        ('NASA Turbofan Jet Engine Data Set', 'load_turbofan_jet_engine()'),
        ('TEP (Tennessee Eastman process)', 'load_tep()'),
        ('Pressurized Water Reactor (PWR) Dataset for Fault Detection', 'load_pwr_anomalies()'),
        ('NPP Power Transformer RUL', 'load_transformer_rul()'),
        ('Exhauster Fault Detection dataset', 'load_exhauster_faults()'),
    )
    return dict(catalogue)
def load_combines() -> Dataset:
    '''
    Download the Combines dataset and reshape it into a wide table.

    The spreadsheet is read with its first two rows skipped and then
    pivoted: ``'Время'`` becomes the index, every distinct ``'Описание'``
    becomes a column and the cells are taken from ``'Значение'``.

    Returns
    -------
    Dataset
        ``frame`` holds the pivoted table and ``feature_names`` its columns.
        ``description`` and ``task`` are empty strings; ``target`` and
        ``target_names`` are ``None``.

    References
    ----------
    L-BFGS-B -- Software for Large-scale Bound-constrained Optimization
    Ciyou Zhu, Richard Byrd, Jorge Nocedal and Jose Luis Morales.
    http://users.iems.northwestern.edu/~nocedal/lbfgsb.html
    NOTE(review): this reference looks unrelated to the dataset -- confirm.
    '''
    source_url = 'https://www.dropbox.com/scl/fi/4dqcr9sdyc6z91925e0yq/data.xls?dl=1&rlkey=1rlgka6ngn7lpja8869flz1m1'
    raw_sheet = pd.read_excel(source_url, skiprows=2)
    wide_table = raw_sheet.pivot_table(
        values='Значение',
        index='Время',
        columns='Описание',
    )
    return Dataset(
        name='Combines state monitoring',
        description='',
        task='',
        frame=wide_table,
        target=None,
        feature_names=list(wide_table.columns),
        target_names=None,
    )
def load_skab_teaser() -> Dataset:
    '''
    Download the SKAB (skoltech anomaly benchmark) teaser and pivot it.

    The semicolon-separated file is pivoted so that ``'datetime'`` is the
    index, every ``'id'`` is a column and the cells come from ``'value'``.

    Returns
    -------
    Dataset
        A dataset object with the following structure:
            name : str
            description : str
            task : str
            frame : list
                Two items: the pivoted ``pd.DataFrame`` and a list of
                ``(start, end)`` timestamp-string pairs (the test labels;
                presumably the labelled intervals -- confirm).
            target : None
            feature_names : list
                Columns of the pivoted table.
            target_names : None

    References
    ----------
    SKAB - Skoltech Anomaly Benchmark | teaser
    Iurii Katser and Viacheslav Kozitsin.
    https://www.kaggle.com/datasets/yuriykatser/skoltech-anomaly-benchmark-skab-teaser
    '''
    # The file id is the second-to-last path segment of the share link;
    # it is re-used to build the download URL.
    share_link = 'https://drive.google.com/file/d/1Gtz3LJLxoyHLatV_d07Pny5wKHGGsbj3/view?usp=share_link'
    file_id = share_link.split('/')[-2]
    download_url = 'https://drive.google.com/uc?id=' + file_id

    long_table = pd.read_csv(download_url, sep=';', parse_dates=['datetime'])
    features = long_table.pivot_table(values='value', index='datetime', columns='id')

    test_intervals = [
        ('2019-07-08 18:39:22', '2019-07-08 18:42:32'),
        ('2019-07-08 18:44:36', '2019-07-08 18:46:51'),
        ('2019-07-08 19:06:57', '2019-07-08 19:11:31'),
        ('2019-07-08 19:14:40', '2019-07-08 19:21:16'),
    ]

    return Dataset(
        name='SKAB (skoltech anomaly benchmark) teaser',
        description='Dataset for process monitoring (changepoint detection) benchmarking. It is just a short version (teaser) of SKAB',
        task='Process monitoring (changepoint detection)',
        frame=[features, test_intervals],
        target=None,
        feature_names=list(features.columns),
        target_names=None,
    )
def load_skab() -> Dataset:
    '''
    Download SKAB (skoltech anomaly benchmark) as a single indexed table.

    The comma-separated file is read with ``'datetime'`` parsed as dates
    and indexed by the pair ``('experiment', 'datetime')``.

    Returns
    -------
    Dataset
        A dataset object with the following structure:
            name : str
            description : str
            task : str
            frame : pd.DataFrame
            target : None
                Targets are not split out of ``frame`` (presumably the
                label columns remain inside it -- confirm).
            feature_names : list
                Hard-coded list of eight sensor names.
            target_names : list
                ``['anomaly', 'changepoint']``.

    References
    ----------
    Skoltech anomaly benchmark (skab).
    Katser, Iurii D., and Vyacheslav O. Kozitsin. Kaggle (2020).
    https://www.kaggle.com/dsv/1693952
    '''
    # The file id is the second-to-last path segment of the share link.
    share_link = 'https://drive.google.com/file/d/1_aeGB3M3CNSEqYPxHPuGK0ju6hMuJUoK/view?usp=share_link'
    download_url = 'https://drive.google.com/uc?id=' + share_link.split('/')[-2]

    table = pd.read_csv(download_url, sep=',', parse_dates=['datetime'])
    table = table.set_index(['experiment', 'datetime'])

    sensors = [
        'Accelerometer1RMS', 'Accelerometer2RMS', 'Current', 'Pressure',
        'Temperature', 'Thermocouple', 'Voltage', 'Volume Flow RateRMS',
    ]
    labels = ['anomaly', 'changepoint']

    return Dataset(
        name='SKAB (skoltech anomaly benchmark)',
        description='Dataset for process monitoring (changepoint detection) benchmarking',
        task='Process monitoring (changepoint detection)',
        frame=table,
        target=None,
        feature_names=sensors,
        target_names=labels,
    )
def load_turbofan_jet_engine() -> Dataset:
    '''
    Download the NASA Turbofan Jet Engine Data Set (train, test, truth).

    Each file is read as a space-separated table without a header, its
    surplus columns are dropped (presumably empty columns produced by
    trailing separators -- confirm) and the remaining columns are named.

    Returns
    -------
    Dataset
        A dataset object with the following structure:
            name : str
            description : str
            task : str
            frame : list[pd.DataFrame]
                ``[train, test, truth]`` in that order; ``truth`` has the
                single column ``'ttf'``.
            target : None
            feature_names : list
                Column names shared by the train and test tables.
            target_names : list
                ``['ttf']``.

    References
    ----------
    Damage Propagation Modeling for Aircraft Engine Run-to-Failure Simulation
    A. Saxena, K. Goebel, D. Simon, and N. Eklund. in the Proceedings of the 1st International Conference on Prognostics and Health Management (PHM08), Denver CO, Oct 2008.
    https://www.kaggle.com/datasets/behrad3d/nasa-cmaps
    '''
    def _fetch(url, surplus_columns, column_names):
        # Read one headerless file, discard surplus columns, name the rest.
        table = pd.read_csv(url, sep=' ', header=None)
        table = table.drop(columns=surplus_columns)
        table.columns = column_names
        return table

    feature_names = ['id', 'cycle', 'setting1', 'setting2', 'setting3']
    feature_names += ['s1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11']
    feature_names += ['s12', 's13', 's14', 's15', 's16', 's17', 's18', 's19', 's20', 's21']
    target_names = ['ttf']

    train = _fetch('http://azuremlsamples.azureml.net/templatedata/PM_train.txt', [26, 27], feature_names)
    test = _fetch('http://azuremlsamples.azureml.net/templatedata/PM_test.txt', [26, 27], feature_names)
    truth = _fetch('http://azuremlsamples.azureml.net/templatedata/PM_truth.txt', [1], target_names)

    description = '''Dataset includes Run-to-Failure simulated data from turbo fan jet engines. In this dataset the goal is to predict the remaining useful life (RUL) of each engine in the test dataset. RUL is equivalent of number of flights remained for the engine after the last datapoint in the test dataset.
- In train dataset there are 100 engines. The last cycle for each engine represents the cycle when failure had happened.
- In test dataset there are 100 engines as well. But this time, failure cycle was not provided.'''

    return Dataset(
        name='NASA Turbofan Jet Engine Data Set',
        description=description,
        task='Remaining useful life prediction',
        frame=[train, test, truth],
        target=None,
        feature_names=feature_names,
        target_names=target_names,
    )
def load_tep() -> Dataset:
    '''
    Download the TEP (Tennessee Eastman process) dataset as one table.

    The comma-separated file is read, its ``'Unnamed: 0'`` column is
    renamed to ``'index'`` and the table is indexed by the pair
    ``('experiment', 'index')``.

    Returns
    -------
    Dataset
        A dataset object with the following structure:
            name : str
            description : str
            task : str
            frame : pd.DataFrame
            target : None
            feature_names : list
                Columns of ``frame``.
            target_names : None

    References
    ----------
    Professor Richard Braatz. Large Scale Systems Research Laboratory.
    https://github.com/YKatser/CPDE/tree/master/TEP_data
    '''
    # The file id is the second-to-last path segment of the share link.
    share_link = 'https://drive.google.com/file/d/1zQq2TDKv0fBvXrDwkr9S08k3a3RNPDHO/view?usp=sharing'
    download_url = 'https://drive.google.com/uc?id=' + share_link.split('/')[-2]

    table = (
        pd.read_csv(download_url, sep=',')
        .rename(columns={'Unnamed: 0': 'index'})
        .set_index(['experiment', 'index'])
    )

    return Dataset(
        name='TEP (Tennessee Eastman process)',
        description='Each training data file contains 480 rows and 52 columns and each testing data file contains 960 rows and 52 columns. An observation vector at a particular time instant is given by x=[XMEAS(1), XMEAS(2), ..., XMEAS(41), XMV(1), ..., XMV(11)]^T where XMEAS(n) is the n-th measured variable and XMV(n) is the n-th manipulated variable.',
        task='Outlier detection',
        frame=table,
        target=None,
        feature_names=list(table.columns),
        target_names=None,
    )
def load_pwr_anomalies() -> Dataset:
    '''
    Download the Pressurized Water Reactor (PWR) fault-detection dataset.

    The comma-separated file is read with its ``'Readings'`` column used
    as the index.

    Returns
    -------
    Dataset
        A dataset object with the following structure:
            name : str
            description : str
            task : str
            frame : pd.DataFrame
            target : None
            feature_names : list
                Columns of ``frame``.
            target_names : None

    References
    ----------
    Pressurized Water Reactor (PWR) Dataset for Fault Detection
    ENGR. MUSHFIQUR RASHID KHAN
    https://www.kaggle.com/datasets/prottoymushfiq/pressurized-water-reactor-abnormality-dataset
    '''
    # The file id is the second-to-last path segment of the share link.
    share_link = 'https://drive.google.com/file/d/1JjPzjqU9QWoFvrlEizoJvqTT6n0OVgNN/view?usp=share_link'
    download_url = 'https://drive.google.com/uc?id=' + share_link.split('/')[-2]

    readings = pd.read_csv(download_url, sep=',', index_col='Readings')

    return Dataset(
        name='Pressurized Water Reactor (PWR) Dataset for Fault Detection',
        description='Our collected dataset is benchmark data in case of reactor abnormalities detection with labels. There are 267 readings from 14 sensors of three categories: a temperature sensor, pressure sensor, and vibration sensor (including ionization chamber, accelerometer, and relative displacement sensors). This particular dataset can be utilized in the case of unsupervised abnormality detection.',
        task='Anomaly detection',
        frame=readings,
        target=None,
        feature_names=list(readings.columns),
        target_names=None,
    )
[docs]def load_exhauster_faults(equipment_number=1) -> Dataset:
'''
Loads and slightly preprocesses raw data of Exhauster data.
Telemetry Time Series Dataset for Fault Detection of Exhauster sintering machines.
Parameters:
-----------
equipment_number : int
Number of equipment (dataset) to load data for.
Possible values are {1, 2, 3, 4, 5, 6}
Returns
-------
Dataset
A dataset object with the following structure:
name : str
description : str
task : str
frame: pd.DataFrame
target: pd.DataFrame
feature_names : list
target_names : list
References
----------
'''
get_link = 'https://getfile.dokpub.com/yandex/get/'
links = {
1 : 'https://disk.yandex.ru/d/NUPXNgancdY-kw',
2 : 'https://disk.yandex.ru/d/16moKa1JyR5UOg',
3 : 'https://disk.yandex.ru/d/_pIk_m2DVdqYXw',
4 : 'https://disk.yandex.ru/d/KKnuyNbDMf2kyA',
5 : 'https://disk.yandex.ru/d/LWsbxXmJFC0hbg',
6 : 'https://disk.yandex.ru/d/IOerYX7eQuyB8Q',
}
url = get_link + links[equipment_number]
frame = pd.read_parquet(url)
target_names = ['anomaly', 'anomaly_category']
target = frame[target_names]
frame = frame.drop(columns=target_names)
name = 'Exhauster Fault Detection dataset'
description = 'Telemetry Time Series Dataset for Fault Detection of Exhauster sintering machines.'
task = 'Process monitoring (changepoint detection)'
feature_names = ['Rotor Current 1', 'Rotor Current 2', 'Stator Current',
'Oil Pressure in System', 'Bearing Temperature on Support 1',
'Bearing Temperature on Support 2', 'Bearing Temperature on Support 3',
'Bearing Temperature on Support 4', 'Oil Temperature in System',
'Oil Temperature in Oil Block', 'Vibration on Support 1',
'Vibration on Support 2', 'Vibration on Support 3',
'Vibration on Support 3. Longitudinal.', 'Vibration on Support 4',
'Vibration on Support 4. Longitudinal.', 'anomaly', 'anomaly_category']
return Dataset(name=name, description=description, task=task,
frame=frame, feature_names=feature_names,
target=target, target_names=target_names)