Pipeline¶
In [30]:
Copied!
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
class Pipeline:
def __init__(self):
pass
def _impute(self):
raise NotImplementedError()
def _scale(self):
raise NotImplementedError()
def _classify(self):
raise NotImplementedError()
def _split_data(self):
raise NotImplementedError()
def process(self):
raise NotImplementedError()
class FIREMAN_Pipeline(Pipeline):
def __init__(self, dataset_x, dataset_y, imputer='Simple', scaler='RandomScaler', classifier='RandomForest', scorer='report'):
self.dataset_x = dataset_x
self.dataset_y = dataset_y
self.imputer = imputer
self.scaler = scaler
self.classifier = classifier
self.scorer = scorer
def _impute(self):
if self.imputer == 'GAIN':
pass
elif self.imputer == 'Simple':
imputer = SimpleImputer()
imputed_x = imputer.fit_transform(self.dataset_x)
elif self.imputer == '':
imputed_x = self.dataset_x
else:
raise NotImplementedError()
return imputed_x
def _scale(self, x):
if self.scaler == 'RandomScaler':
scaler = StandardScaler()
scaler.fit(x)
return scaler.transform(x)
elif self.scaler == '':
return x
else:
raise NotImplementedError()
def _split_data(self, x):
if self.scorer == 'cv_score':
return x, self.dataset_y
elif self.scorer == 'report':
x_train, x_test, y_train, y_test = train_test_split(x, self.dataset_y, test_size=0.1)
return x_train, x_test, y_train, y_test
def _classify(self):
if self.classifier == 'RandomForest':
self.classifier = RandomForestClassifier()
else:
raise NotImplementedError()
def process(self):
x_missing = self._impute()
x_scaled = self._scale(x_missing)
if self.scorer=='report':
x_train, x_test, y_train, y_test = self._split_data(x_scaled)
self._classify()
self.classifier.fit(x_train, y_train)
y_predicted = self.classifier.predict(x_test)
return print(classification_report(y_test, y_predicted))
elif self.scorer=='cv_score':
x, y = self._split_data(dataset_x_scaled)
_classify()
return print(cross_val_score(self.classifier, x, y, cv=10, scoring='f1_weighted'))
else:
raise NotImplementedError()
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
class Pipeline:
def __init__(self):
pass
def _impute(self):
raise NotImplementedError()
def _scale(self):
raise NotImplementedError()
def _classify(self):
raise NotImplementedError()
def _split_data(self):
raise NotImplementedError()
def process(self):
raise NotImplementedError()
class FIREMAN_Pipeline(Pipeline):
def __init__(self, dataset_x, dataset_y, imputer='Simple', scaler='RandomScaler', classifier='RandomForest', scorer='report'):
self.dataset_x = dataset_x
self.dataset_y = dataset_y
self.imputer = imputer
self.scaler = scaler
self.classifier = classifier
self.scorer = scorer
def _impute(self):
if self.imputer == 'GAIN':
pass
elif self.imputer == 'Simple':
imputer = SimpleImputer()
imputed_x = imputer.fit_transform(self.dataset_x)
elif self.imputer == '':
imputed_x = self.dataset_x
else:
raise NotImplementedError()
return imputed_x
def _scale(self, x):
if self.scaler == 'RandomScaler':
scaler = StandardScaler()
scaler.fit(x)
return scaler.transform(x)
elif self.scaler == '':
return x
else:
raise NotImplementedError()
def _split_data(self, x):
if self.scorer == 'cv_score':
return x, self.dataset_y
elif self.scorer == 'report':
x_train, x_test, y_train, y_test = train_test_split(x, self.dataset_y, test_size=0.1)
return x_train, x_test, y_train, y_test
def _classify(self):
if self.classifier == 'RandomForest':
self.classifier = RandomForestClassifier()
else:
raise NotImplementedError()
def process(self):
x_missing = self._impute()
x_scaled = self._scale(x_missing)
if self.scorer=='report':
x_train, x_test, y_train, y_test = self._split_data(x_scaled)
self._classify()
self.classifier.fit(x_train, y_train)
y_predicted = self.classifier.predict(x_test)
return print(classification_report(y_test, y_predicted))
elif self.scorer=='cv_score':
x, y = self._split_data(dataset_x_scaled)
_classify()
return print(cross_val_score(self.classifier, x, y, cv=10, scoring='f1_weighted'))
else:
raise NotImplementedError()
In [31]:
Copied!
tep_dataset = pd.read_csv('Tennessee_Event-Driven/datasets/tep_extended_dataset_simrun1.csv.csv',index_col=False)
dataset_X = tep_dataset.drop(columns=['faultNumber', 'simulationRun', 'sample']).values
dataset_Y = tep_dataset['faultNumber'].values
no, dim = dataset_X.shape
p = 0.1
# Introduce missing data
mask = binary_sampler(1-p, no, dim)
dataset_X_missing = dataset_X.copy()
dataset_X_missing[mask == 0] = np.nan
tep_dataset = pd.read_csv('Tennessee_Event-Driven/datasets/tep_extended_dataset_simrun1.csv.csv',index_col=False)
dataset_X = tep_dataset.drop(columns=['faultNumber', 'simulationRun', 'sample']).values
dataset_Y = tep_dataset['faultNumber'].values
no, dim = dataset_X.shape
p = 0.1
# Introduce missing data
mask = binary_sampler(1-p, no, dim)
dataset_X_missing = dataset_X.copy()
dataset_X_missing[mask == 0] = np.nan
In [32]:
Copied!
tep_pipeline = FIREMAN_Pipeline(dataset_X_missing, dataset_Y)
tep_pipeline = FIREMAN_Pipeline(dataset_X_missing, dataset_Y)
In [33]:
Copied!
tep_pipeline.process()
tep_pipeline.process()
precision recall f1-score support
0 0.32 0.60 0.42 161
1 1.00 0.89 0.94 151
2 1.00 0.89 0.94 124
3 0.33 0.46 0.38 142
4 0.84 0.75 0.79 141
5 0.72 0.76 0.74 147
6 1.00 0.88 0.93 130
7 1.00 0.86 0.92 149
8 0.99 0.93 0.96 145
9 0.36 0.45 0.40 154
10 0.77 0.69 0.73 147
11 0.80 0.61 0.69 150
12 0.91 0.82 0.86 141
13 1.00 0.83 0.91 156
14 0.97 0.79 0.87 166
15 0.30 0.44 0.35 142
16 0.76 0.65 0.70 160
17 0.82 0.70 0.75 123
18 0.98 0.79 0.88 130
19 0.63 0.66 0.64 134
20 0.82 0.63 0.71 133
21 0.97 0.88 0.92 144
accuracy 0.72 3170
macro avg 0.79 0.73 0.75 3170
weighted avg 0.78 0.72 0.75 3170
example of raising NotImplementedError¶
In [35]:
Copied!
tep_pipeline = FIREMAN_Pipeline(dataset_X_missing, dataset_Y, scaler='Scaler')
tep_pipeline.process()
tep_pipeline = FIREMAN_Pipeline(dataset_X_missing, dataset_Y, scaler='Scaler')
tep_pipeline.process()
--------------------------------------------------------------------------- NotImplementedError Traceback (most recent call last) <ipython-input-35-8b9e5322f537> in <module> 1 tep_pipeline = FIREMAN_Pipeline(dataset_X_missing, dataset_Y, scaler='Scaler') ----> 2 tep_pipeline.process() <ipython-input-30-c048450cec64> in process(self) 79 def process(self): 80 x_missing = self._impute() ---> 81 x_scaled = self._scale(x_missing) 82 if self.scorer=='report': 83 x_train, x_test, y_train, y_test = self._split_data(x_scaled) <ipython-input-30-c048450cec64> in _scale(self, x) 61 62 else: ---> 63 raise NotImplementedError() 64 65 def _split_data(self, x): NotImplementedError: