Imports¶
In [1]:
Copied!
import pandas as pd
from src import pipeline
from src import common
import pandas as pd
from src import pipeline
from src import common
Dataset preprocessing¶
In [2]:
Copied!
# Dataset preprocessing: load the column-role configuration and the raw
# dataset, turn the binary fault flag into one class id per measurement label,
# and fill missing values.
column_types_loc = "#datasets/Colab_PowerConverter/column_types.json"
column_types = common.json_load(column_types_loc)
target = column_types["target"]
identifier = column_types["identifier"]
cat_cols = column_types["categorical"]
measurement_label = column_types["measurement_label"]

# NOTE(review): unpickling can execute arbitrary code - only load this file
# from a trusted location.
data = pd.read_pickle("#datasets/Colab_PowerConverter/dataset.pkl")

# Exclude the "Single-Phase_Sensor_Fault" measurement: it apparently did not
# contain a fault (unconfirmed - see the "(?)" in the original note).
# reset_index returns a fresh frame, so the .loc writes below do not operate
# on a slice of the unfiltered data.
data = data[data[measurement_label] != "Single-Phase_Sensor_Fault"].reset_index(drop=True)

# Assign a unique class id (1, 2, ...) to each measurement label and keep the
# label -> id mapping for easier analysis. Only rows currently flagged with
# target == 1 are relabelled; all other rows keep their existing target value.
fault_dict = {}
for fault_id, label in enumerate(data[measurement_label].unique(), start=1):
    data.loc[(data[measurement_label] == label) & (data[target] == 1), target] = fault_id
    fault_dict[label] = fault_id
data = data.drop(columns=[measurement_label])

# Fill NA - 0 for numerical and 'NA' for categorical
# categorical
data[cat_cols] = data[cat_cols].fillna("NA").astype(str)
# non-categorical (everything except the categorical columns and the identifier)
non_cat_cols = data.drop(columns=cat_cols + [identifier]).columns.tolist()
data[non_cat_cols] = data[non_cat_cols].fillna(0)
# Dataset preprocessing: load the column-role configuration and the raw
# dataset, turn the binary fault flag into one class id per measurement label,
# and fill missing values.
column_types_loc = "#datasets/Colab_PowerConverter/column_types.json"
column_types = common.json_load(column_types_loc)
target = column_types["target"]
identifier = column_types["identifier"]
cat_cols = column_types["categorical"]
measurement_label = column_types["measurement_label"]

# NOTE(review): unpickling can execute arbitrary code - only load this file
# from a trusted location.
data = pd.read_pickle("#datasets/Colab_PowerConverter/dataset.pkl")

# Exclude the "Single-Phase_Sensor_Fault" measurement: it apparently did not
# contain a fault (unconfirmed - see the "(?)" in the original note).
# reset_index returns a fresh frame, so the .loc writes below do not operate
# on a slice of the unfiltered data.
data = data[data[measurement_label] != "Single-Phase_Sensor_Fault"].reset_index(drop=True)

# Assign a unique class id (1, 2, ...) to each measurement label and keep the
# label -> id mapping for easier analysis. Only rows currently flagged with
# target == 1 are relabelled; all other rows keep their existing target value.
fault_dict = {}
for fault_id, label in enumerate(data[measurement_label].unique(), start=1):
    data.loc[(data[measurement_label] == label) & (data[target] == 1), target] = fault_id
    fault_dict[label] = fault_id
data = data.drop(columns=[measurement_label])

# Fill NA - 0 for numerical and 'NA' for categorical
# categorical
data[cat_cols] = data[cat_cols].fillna("NA").astype(str)
# non-categorical (everything except the categorical columns and the identifier)
non_cat_cols = data.drop(columns=cat_cols + [identifier]).columns.tolist()
data[non_cat_cols] = data[non_cat_cols].fillna(0)
Train model¶
In [3]:
Copied!
# Training configuration handed to pipeline.train in the next cell.
task = "multiclass"
column_types_loc = "#datasets/Colab_PowerConverter/column_types.json"
save_loc = "models.dill"

# Option names are interpreted by src.pipeline; presumably the seed, the
# split fractions and the scaler choice - confirm against pipeline.train.
parameters = dict(
    random_state=1,
    test_size_train=0.2,
    test_size_valid=0.5,
    scaler="Standard",
)
# Training configuration handed to pipeline.train in the next cell.
task = "multiclass"
column_types_loc = "#datasets/Colab_PowerConverter/column_types.json"
save_loc = "models.dill"

# Option names are interpreted by src.pipeline; presumably the seed, the
# split fractions and the scaler choice - confirm against pipeline.train.
parameters = dict(
    random_state=1,
    test_size_train=0.2,
    test_size_valid=0.5,
    scaler="Standard",
)
In [4]:
Copied!
# Train on the preprocessed frame. With datasets=True the call returns four
# values, unpacked here as the three scaled splits plus the `models` mapping
# (its "scaler" entry is used later in this notebook).
data_train_scaled, data_valid_scaled, data_test_scaled, models = pipeline.train(
    task,
    data,
    column_types_loc,
    parameters,
    save_loc=save_loc,
    verbose=True,
    datasets=True,
)
# Train on the preprocessed frame. With datasets=True the call returns four
# values, unpacked here as the three scaled splits plus the `models` mapping
# (its "scaler" entry is used later in this notebook).
data_train_scaled, data_valid_scaled, data_test_scaled, models = pipeline.train(
    task,
    data,
    column_types_loc,
    parameters,
    save_loc=save_loc,
    verbose=True,
    datasets=True,
)
Size of dataset classes: 0 597599 5 40014 3 40001 6 40001 7 40001 8 40001 9 40001 10 40001 11 40001 13 40001 1 38971 2 38971 4 3166 12 1335 Name: fault, dtype: int64
/home/palo/miniconda3/lib/python3.8/site-packages/pytorch_widedeep/preprocessing/tab_preprocessor.py:202: UserWarning: Continuous columns will not be normalised
warnings.warn("Continuous columns will not be normalised")
epoch 1: 100%|██████████| 748/748 [00:12<00:00, 59.65it/s, loss=0.0901, metrics={'Accuracy': [0.0709, 0.3708, 0.0576, 0.0845, 0.0206, 0.574, 0.0347, 0.1306, 0.086, 0.0327, 0.1312, 0.066, 0.5213, 0.1361], 'Precision': 0.1665, 'F1': [0.0846, 0.3402, 0.0585, 0.0863, 0.0343, 0.4997, 0.0449, 0.1097, 0.078, 0.0404, 0.1053, 0.0765, 0.5182, 0.1002], 'Recall': [0.0709, 0.3708, 0.0576, 0.0845, 0.0206, 0.574, 0.0347, 0.1306, 0.086, 0.0327, 0.1312, 0.066, 0.5213, 0.1361]}]
valid: 100%|██████████| 1041/1041 [00:14<00:00, 73.08it/s, loss=0.04, metrics={'Accuracy': [0.0082, 0.8353, 0.0, 0.0377, 0.1893, 1.0, 0.0, 0.2442, 0.026, 0.003, 0.0655, 0.0115, 0.9104, 0.544], 'Precision': 0.1121, 'F1': [0.0163, 0.4081, 0.0, 0.0517, 0.0304, 0.8036, 0.0, 0.1444, 0.0165, 0.0028, 0.0396, 0.0218, 0.4909, 0.0896], 'Recall': [0.0082, 0.8353, 0.0, 0.0377, 0.1893, 1.0, 0.0, 0.2442, 0.026, 0.003, 0.0655, 0.0115, 0.9104, 0.544]}]
Epoch 00001: val_loss improved from inf to 0.04004
epoch 2: 100%|██████████| 748/748 [00:12<00:00, 60.35it/s, loss=0.0342, metrics={'Accuracy': [0.0544, 0.5636, 0.0509, 0.0885, 0.0363, 0.7402, 0.0194, 0.1295, 0.0924, 0.0334, 0.146, 0.078, 0.7631, 0.1842], 'Precision': 0.2145, 'F1': [0.0747, 0.5727, 0.0601, 0.0905, 0.0557, 0.5727, 0.0306, 0.1088, 0.085, 0.0429, 0.1125, 0.0889, 0.7579, 0.1137], 'Recall': [0.0544, 0.5636, 0.0509, 0.0885, 0.0363, 0.7402, 0.0194, 0.1295, 0.0924, 0.0334, 0.146, 0.078, 0.7631, 0.1842]}]
valid: 100%|██████████| 1041/1041 [00:13<00:00, 79.96it/s, loss=0.0272, metrics={'Accuracy': [0.0024, 0.9364, 0.0, 0.0457, 0.2334, 1.0, 0.0, 0.1922, 0.015, 0.007, 0.1128, 0.044, 1.0, 0.5215], 'Precision': 0.113, 'F1': [0.0048, 0.3698, 0.0, 0.0637, 0.0216, 0.8366, 0.0, 0.125, 0.0111, 0.0061, 0.0726, 0.0708, 0.4504, 0.0934], 'Recall': [0.0024, 0.9364, 0.0, 0.0457, 0.2334, 1.0, 0.0, 0.1922, 0.015, 0.007, 0.1128, 0.044, 1.0, 0.5215]}]
Epoch 00002: val_loss improved from 0.04004 to 0.02722
epoch 3: 100%|██████████| 748/748 [00:12<00:00, 61.77it/s, loss=0.027, metrics={'Accuracy': [0.0791, 0.6588, 0.0598, 0.0924, 0.0569, 0.8017, 0.0207, 0.1098, 0.1057, 0.0589, 0.1376, 0.1063, 0.849, 0.1214], 'Precision': 0.2316, 'F1': [0.1013, 0.6285, 0.0669, 0.0927, 0.0804, 0.5838, 0.0317, 0.103, 0.0961, 0.0669, 0.1075, 0.1075, 0.8227, 0.1002], 'Recall': [0.0791, 0.6588, 0.0598, 0.0924, 0.0569, 0.8017, 0.0207, 0.1098, 0.1057, 0.0589, 0.1376, 0.1063, 0.849, 0.1214]}]
valid: 100%|██████████| 1041/1041 [00:14<00:00, 72.86it/s, loss=0.0246, metrics={'Accuracy': [0.0456, 1.0, 0.059, 0.0047, 0.224, 1.0, 0.0, 0.139, 0.05, 0.0795, 0.1643, 0.1107, 1.0, 0.3315], 'Precision': 0.1402, 'F1': [0.0873, 0.3314, 0.0351, 0.0093, 0.0189, 0.8196, 0.0, 0.0972, 0.0384, 0.0484, 0.0954, 0.1344, 0.4882, 0.0962], 'Recall': [0.0456, 1.0, 0.059, 0.0047, 0.224, 1.0, 0.0, 0.139, 0.05, 0.0795, 0.1643, 0.1107, 1.0, 0.3315]}]
Epoch 00003: val_loss improved from 0.02722 to 0.02458
epoch 4: 100%|██████████| 748/748 [00:13<00:00, 54.20it/s, loss=0.0248, metrics={'Accuracy': [0.0917, 0.7435, 0.078, 0.1007, 0.0858, 0.8367, 0.0423, 0.0905, 0.1033, 0.0728, 0.1381, 0.1088, 0.879, 0.0769], 'Precision': 0.2472, 'F1': [0.1115, 0.6877, 0.0815, 0.1009, 0.1126, 0.6177, 0.0576, 0.0912, 0.0948, 0.076, 0.1088, 0.1031, 0.8542, 0.0857], 'Recall': [0.0917, 0.7435, 0.078, 0.1007, 0.0858, 0.8367, 0.0423, 0.0905, 0.1033, 0.0728, 0.1381, 0.1088, 0.879, 0.0769]}]
valid: 100%|██████████| 1041/1041 [00:15<00:00, 65.74it/s, loss=0.0236, metrics={'Accuracy': [0.1012, 1.0, 0.0929, 0.015, 0.3438, 1.0, 0.0022, 0.0815, 0.0512, 0.1233, 0.1855, 0.153, 1.0, 0.2093], 'Precision': 0.1715, 'F1': [0.1837, 0.3962, 0.0566, 0.0253, 0.018, 0.8283, 0.0044, 0.0851, 0.0361, 0.0606, 0.0974, 0.1092, 0.4214, 0.0998], 'Recall': [0.1012, 1.0, 0.0929, 0.015, 0.3438, 1.0, 0.0022, 0.0815, 0.0512, 0.1233, 0.1855, 0.153, 1.0, 0.2093]}]
Epoch 00004: val_loss improved from 0.02458 to 0.02363
epoch 5: 100%|██████████| 748/748 [00:13<00:00, 53.66it/s, loss=0.0238, metrics={'Accuracy': [0.1076, 0.8108, 0.083, 0.1031, 0.1029, 0.863, 0.0589, 0.0871, 0.102, 0.0793, 0.1402, 0.1161, 0.8949, 0.0486], 'Precision': 0.2583, 'F1': [0.1279, 0.7308, 0.0829, 0.1019, 0.1253, 0.6538, 0.073, 0.0924, 0.0956, 0.0792, 0.1131, 0.1104, 0.8763, 0.063], 'Recall': [0.1076, 0.8108, 0.083, 0.1031, 0.1029, 0.863, 0.0589, 0.0871, 0.102, 0.0793, 0.1402, 0.1161, 0.8949, 0.0486]}]
valid: 100%|██████████| 1041/1041 [00:17<00:00, 58.36it/s, loss=0.0232, metrics={'Accuracy': [0.1325, 1.0, 0.1183, 0.0223, 0.3407, 1.0, 0.0457, 0.052, 0.07, 0.1395, 0.1618, 0.1595, 1.0, 0.1252], 'Precision': 0.1887, 'F1': [0.2336, 0.3629, 0.0577, 0.0369, 0.0151, 0.8704, 0.0649, 0.0666, 0.0568, 0.0635, 0.107, 0.1213, 0.4621, 0.084], 'Recall': [0.1325, 1.0, 0.1183, 0.0223, 0.3407, 1.0, 0.0457, 0.052, 0.07, 0.1395, 0.1618, 0.1595, 1.0, 0.1252]}]
Epoch 00005: val_loss improved from 0.02363 to 0.02323 Model weights restored to best epoch: 5
predict: 100%|██████████| 1041/1041 [00:05<00:00, 179.81it/s]
Classification report:
precision recall f1-score support
0 0.98 0.13 0.23 59760
1 0.22 1.00 0.37 3897
2 0.03 0.11 0.05 3897
3 0.11 0.02 0.04 4000
4 0.01 0.29 0.01 316
5 0.77 1.00 0.87 4002
6 0.11 0.05 0.07 4000
7 0.09 0.05 0.07 4000
8 0.05 0.08 0.06 4001
9 0.04 0.14 0.07 4001
10 0.08 0.16 0.11 4000
11 0.11 0.18 0.14 4000
12 0.29 0.98 0.45 133
13 0.06 0.12 0.08 4000
accuracy 0.19 104007
macro avg 0.21 0.31 0.19 104007
weighted avg 0.63 0.19 0.21 104007
In [ ]:
Copied!
# Run prediction on the full preprocessed dataset.
# NOTE(review): presumably pipeline.predict reloads the artefacts stored at save_loc - confirm in src.pipeline.
predicted = pipeline.predict(data, column_types_loc, save_loc)
# Run prediction on the full preprocessed dataset.
# NOTE(review): presumably pipeline.predict reloads the artefacts stored at save_loc - confirm in src.pipeline.
predicted = pipeline.predict(data, column_types_loc, save_loc)
Outlier_model¶
- `outlier_model`: code that implements the `outlier_model.predict()` function
In [ ]:
Copied!
In [23]:
Copied!
# Rebuild an unscaled copy of the test split: invert the fitted scaler on the
# feature columns, then re-attach the identifier and target columns.
# NOTE(review): the categorical columns are dropped and not added back.
scaled_features = data_test_scaled.drop(columns=cat_cols + [identifier, target])
data_test = pd.DataFrame(
    models["scaler"].inverse_transform(scaled_features),
    columns=scaled_features.columns.values,
    # Keep the original row labels: the assignment below aligns on the index,
    # so a fresh 0..n-1 index would mismatch rows if the split was not reset.
    index=scaled_features.index,
)
data_test[[identifier, target]] = data_test_scaled[[identifier, target]].copy()
# Rebuild an unscaled copy of the test split: invert the fitted scaler on the
# feature columns, then re-attach the identifier and target columns.
# NOTE(review): the categorical columns are dropped and not added back.
scaled_features = data_test_scaled.drop(columns=cat_cols + [identifier, target])
data_test = pd.DataFrame(
    models["scaler"].inverse_transform(scaled_features),
    columns=scaled_features.columns.values,
    # Keep the original row labels: the assignment below aligns on the index,
    # so a fresh 0..n-1 index would mismatch rows if the split was not reset.
    index=scaled_features.index,
)
data_test[[identifier, target]] = data_test_scaled[[identifier, target]].copy()
In [25]:
Copied!
# Preview the first rows of the unscaled test frame.
data_test.head()
# Preview the first rows of the unscaled test frame.
data_test.head()
Out[25]:
| f_c | P | m_d | m_q | theta | P_ref | V_DC | V_phaseA | V_phaseB | V_phaseC | I_phaseA | I_phaseB | I_phaseC | sample_id | fault | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50.000254 | 2494.761546 | 311.0 | 0.0 | 777.396328 | 2500.0 | 800.0 | 2.869625 | -270.757247 | 267.887622 | -0.150869 | -4.556102 | 4.706970 | 1069662 | 0 |
| 1 | 50.000006 | 2499.877565 | 311.0 | 0.0 | 1172.238561 | 2500.0 | 800.0 | -259.870040 | -18.022479 | 277.892519 | -228.781908 | 407.314521 | -178.532613 | 614815 | 7 |
| 2 | 50.000000 | 2499.999997 | 311.0 | 0.0 | 1424.037213 | 2500.0 | 800.0 | -153.983148 | -157.011936 | 310.995084 | -357.403522 | 349.748723 | 7.654799 | 630845 | 7 |
| 3 | 50.000000 | 2499.999999 | 311.0 | 0.0 | 649.626528 | 2500.0 | 800.0 | -268.948563 | 269.717592 | -0.769029 | -4.415660 | 4.864604 | -0.448944 | 741526 | 0 |
| 4 | 50.124340 | 0.000000 | 311.0 | 0.0 | 1131.328035 | 2500.0 | 800.0 | 0.000000 | 0.000000 | 0.000000 | 290.215924 | -60.868123 | -229.347801 | 452148 | 5 |
- Note: loading the whole DL model together with its preprocessors on every iteration is inefficient; it is done here only because I wanted to send Alex at least some initial code.
In [ ]:
Copied!
# Row-by-row fault tracking over the unscaled test frame:
#   * while no fault is active, the outlier model decides whether one starts;
#   * while a fault is active, the classifier is queried and a prediction of 0
#     ends the fault.
# NOTE(review): `outlier_model` is not defined in the visible cells - it must
# be created before this cell can run.
# NOTE(review): a prediction of 0 is treated as "no fault", matching the
# original check - confirm against the class ids produced in preprocessing.
fault = False
for index, row in data_test.iterrows():
    if not fault:
        fault = outlier_model.predict(row)
        # Announce a start only when the detector actually flagged this row.
        if fault:
            print("fault start")
    else:
        predicted = pipeline.predict(row, column_types_loc, save_loc)
        if predicted == 0:
            fault = False
            print("fault ended")
        else:
            print("fault continues")
# Row-by-row fault tracking over the unscaled test frame:
#   * while no fault is active, the outlier model decides whether one starts;
#   * while a fault is active, the classifier is queried and a prediction of 0
#     ends the fault.
# NOTE(review): `outlier_model` is not defined in the visible cells - it must
# be created before this cell can run.
# NOTE(review): a prediction of 0 is treated as "no fault", matching the
# original check - confirm against the class ids produced in preprocessing.
fault = False
for index, row in data_test.iterrows():
    if not fault:
        fault = outlier_model.predict(row)
        # Announce a start only when the detector actually flagged this row.
        if fault:
            print("fault start")
    else:
        predicted = pipeline.predict(row, column_types_loc, save_loc)
        if predicted == 0:
            fault = False
            print("fault ended")
        else:
            print("fault continues")