Preprocessing of Extended TEP dataset¶
Extended TEP dataset has 500 simulation runs and is available in the rdata format at:
https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/6C3JR1
In [2]:
Copied!
import pandas as pd
import numpy as np
import pyreadr
import pandas as pd
import numpy as np
import pyreadr
In [4]:
Copied!
train_normal_path = '#datasets/Tennessee_Event-Driven/TEP_FaultFree_Training.RData'
train_faulty_path = '#datasets/Tennessee_Event-Driven/TEP_Faulty_Training.RData'
test_normal_path = '#datasets/Tennessee_Event-Driven/TEP_FaultFree_Testing.RData'
test_faulty_path = '#datasets/Tennessee_Event-Driven/TEP_Faulty_Testing.RData'
train_normal_complete = pyreadr.read_r(train_normal_path)['fault_free_training']
train_faulty_complete = pyreadr.read_r(train_faulty_path)['faulty_training']
test_normal_complete = pyreadr.read_r(test_normal_path)['fault_free_testing']
test_faulty_complete = pyreadr.read_r(test_faulty_path)['faulty_testing']
train_normal_path = '#datasets/Tennessee_Event-Driven/TEP_FaultFree_Training.RData'
train_faulty_path = '#datasets/Tennessee_Event-Driven/TEP_Faulty_Training.RData'
test_normal_path = '#datasets/Tennessee_Event-Driven/TEP_FaultFree_Testing.RData'
test_faulty_path = '#datasets/Tennessee_Event-Driven/TEP_Faulty_Testing.RData'
train_normal_complete = pyreadr.read_r(train_normal_path)['fault_free_training']
train_faulty_complete = pyreadr.read_r(train_faulty_path)['faulty_training']
test_normal_complete = pyreadr.read_r(test_normal_path)['fault_free_testing']
test_faulty_complete = pyreadr.read_r(test_faulty_path)['faulty_testing']
In [5]:
Copied!
tep_train_dataset = pd.concat([train_normal_complete, train_faulty_complete], ignore_index=True, sort=False)
tep_test_dataset = pd.concat([test_normal_complete, test_faulty_complete], ignore_index=True, sort=False)
tep_train_dataset = pd.concat([train_normal_complete, train_faulty_complete], ignore_index=True, sort=False)
tep_test_dataset = pd.concat([test_normal_complete, test_faulty_complete], ignore_index=True, sort=False)
--------------------------------------------------------------------------- MemoryError Traceback (most recent call last) <ipython-input-5-1f362ef4040b> in <module> 1 tep_train_dataset = pd.concat([train_normal_complete, train_faulty_complete], ignore_index=True, sort=False) ----> 2 tep_test_dataset = pd.concat([test_normal_complete, test_faulty_complete], ignore_index=True, sort=False) ~/miniconda3/lib/python3.8/site-packages/pandas/core/reshape/concat.py in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy) 285 ) 286 --> 287 return op.get_result() 288 289 ~/miniconda3/lib/python3.8/site-packages/pandas/core/reshape/concat.py in get_result(self) 500 mgrs_indexers.append((obj._mgr, indexers)) 501 --> 502 new_data = concatenate_block_managers( 503 mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy, 504 ) ~/miniconda3/lib/python3.8/site-packages/pandas/core/internals/concat.py in concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy) 77 else: 78 b = make_block( ---> 79 _concatenate_join_units(join_units, concat_axis, copy=copy,), 80 placement=placement, 81 ) ~/miniconda3/lib/python3.8/site-packages/pandas/core/internals/concat.py in _concatenate_join_units(join_units, concat_axis, copy) 344 concat_values = np.atleast_2d(concat_values) 345 else: --> 346 concat_values = concat_compat(to_concat, axis=concat_axis,) 347 348 return concat_values ~/miniconda3/lib/python3.8/site-packages/pandas/core/dtypes/concat.py in concat_compat(to_concat, axis) 178 to_concat = [x.astype("object") for x in to_concat] 179 --> 180 return np.concatenate(to_concat, axis=axis) 181 182 <__array_function__ internals> in concatenate(*args, **kwargs) MemoryError: Unable to allocate 3.91 GiB for an array with shape (52, 10080000) and data type float64
In [9]:
Copied!
tep_train_dataset['sample'] = range(len(tep_train_dataset))
tep_train_dataset['faultNumber'] = tep_train_dataset['faultNumber'].astype(int)
tep_train_dataset['simulationRun'] = tep_train_dataset['simulationRun'].astype(int)
tep_test_dataset['sample'] = range(len(tep_test_dataset))
tep_test_dataset['faultNumber'] = tep_test_dataset['faultNumber'].astype(int)
tep_test_dataset['simulationRun'] = tep_test_dataset['simulationRun'].astype(int)
tep_train_dataset['sample'] = range(len(tep_train_dataset))
tep_train_dataset['faultNumber'] = tep_train_dataset['faultNumber'].astype(int)
tep_train_dataset['simulationRun'] = tep_train_dataset['simulationRun'].astype(int)
tep_test_dataset['sample'] = range(len(tep_test_dataset))
tep_test_dataset['faultNumber'] = tep_test_dataset['faultNumber'].astype(int)
tep_test_dataset['simulationRun'] = tep_test_dataset['simulationRun'].astype(int)
In [ ]:
Copied!
# take only single run of the simulation
# tep_train_dataset = tep_train_dataset[tep_train_dataset["simulationRun"]==1]
# tep_test_dataset = tep_test_dataset[tep_test_dataset["simulationRun"]==1]
# drop columns that are not needed
# tep_train_dataset = tep_train_dataset.drop(columns=["simulationRun","sample"])
# tep_test_dataset = tep_test_dataset.drop(columns=["simulationRun","sample"])
# take only single run of the simulation
# tep_train_dataset = tep_train_dataset[tep_train_dataset["simulationRun"]==1]
# tep_test_dataset = tep_test_dataset[tep_test_dataset["simulationRun"]==1]
# drop columns that are not needed
# tep_train_dataset = tep_train_dataset.drop(columns=["simulationRun","sample"])
# tep_test_dataset = tep_test_dataset.drop(columns=["simulationRun","sample"])
In [16]:
Copied!
tep_train_dataset.to_csv('#datasets/Tennessee_Event-Driven/tep_train_extended.csv',index=False)
tep_test_dataset.to_csv('#datasets/Tennessee_Event-Driven/tep_test_extended.csv',index=False)
tep_train_dataset.to_csv('#datasets/Tennessee_Event-Driven/tep_train_extended.csv',index=False)
tep_test_dataset.to_csv('#datasets/Tennessee_Event-Driven/tep_test_extended.csv',index=False)