Imports¶

In [1]:

            
                Copied!
                
import pandas as pd
from src import pipeline
from src import common
import pandas as pd
from src import pipeline
from src import common

Dataset preprocessing¶

In [2]:

            
                Copied!
                
                    
                    
                
                

        
# Dataset preprocessing: load the column-role configuration and the raw
# dataset, turn the binary fault flag into one class id per measurement label,
# and fill missing values.

# JSON file naming which columns play which role (keys read below).
column_types_loc = "#datasets/Colab_PowerConverter/column_types.json"
column_types = common.json_load(column_types_loc)

target = column_types["target"]
identifier = column_types["identifier"]
cat_cols = column_types["categorical"]
measurement_label = column_types["measurement_label"]

# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
data = pd.read_pickle("#datasets/Colab_PowerConverter/dataset.pkl")

# this measurement did not have a fault (?)
data = data[data[measurement_label] != "Single-Phase_Sensor_Fault"].reset_index(drop=True)

# Assign a unique class id (1..N, in order of first appearance) to each
# measurement label and keep the label -> id mapping for easier analysis.
# Only rows whose target is currently 1 are relabelled; all other rows keep
# their existing target value.
labels = data[measurement_label]
is_fault = data[target] == 1  # computed once: each row belongs to exactly one label
fault_dict = {}
for fault_id, label in enumerate(labels.unique(), start=1):
    data.loc[(labels == label) & is_fault, target] = fault_id
    fault_dict[label] = fault_id

# The label column is no longer needed once it has been folded into the target.
data = data.drop(columns=[measurement_label])

# Fill NA - 0 for numerical and 'NA' for categorical
# categorical
data[cat_cols] = data[cat_cols].fillna("NA").astype(str)
# non-categorical (everything except the categorical and identifier columns;
# this includes the target column)
non_cat_cols = data.drop(columns=cat_cols + [identifier]).columns.tolist()
data[non_cat_cols] = data[non_cat_cols].fillna(0)

Train model¶

In [3]:

            
                Copied!
                
                    
                    
                
                

        
# Training configuration passed to pipeline.train / pipeline.predict below.
task = "multiclass"

# Location of the column-role JSON and of the file the trained models are saved to.
column_types_loc = "#datasets/Colab_PowerConverter/column_types.json"
save_loc = "models.dill"

# NOTE(review): presumably test_size_train is the fraction held out from the
# training set and test_size_valid the share of that hold-out used for the
# final test split - confirm against src.pipeline.
parameters = {
    "random_state": 1,
    "test_size_train": 0.2,
    "test_size_valid": 0.5,
    "scaler": "Standard",
}

In [4]:

            
                Copied!
                
                    
                    
                
                

        
# Train once and keep the scaled train/valid/test splits together with the
# fitted models object (indexed with "scaler" later in this notebook).
# With save_loc set, the call is also given the path "models.dill" to save to.
(
    data_train_scaled,
    data_valid_scaled,
    data_test_scaled,
    models,
) = pipeline.train(
    task,
    data,
    column_types_loc,
    parameters,
    save_loc=save_loc,
    verbose=True,
    datasets=True,
)

Size of dataset classes:
0     597599
5      40014
3      40001
6      40001
7      40001
8      40001
9      40001
10     40001
11     40001
13     40001
1      38971
2      38971
4       3166
12      1335
Name: fault, dtype: int64

/home/palo/miniconda3/lib/python3.8/site-packages/pytorch_widedeep/preprocessing/tab_preprocessor.py:202: UserWarning: Continuous columns will not be normalised
  warnings.warn("Continuous columns will not be normalised")
epoch 1: 100%|██████████| 748/748 [00:12<00:00, 59.65it/s, loss=0.0901, metrics={'Accuracy': [0.0709, 0.3708, 0.0576, 0.0845, 0.0206, 0.574, 0.0347, 0.1306, 0.086, 0.0327, 0.1312, 0.066, 0.5213, 0.1361], 'Precision': 0.1665, 'F1': [0.0846, 0.3402, 0.0585, 0.0863, 0.0343, 0.4997, 0.0449, 0.1097, 0.078, 0.0404, 0.1053, 0.0765, 0.5182, 0.1002], 'Recall': [0.0709, 0.3708, 0.0576, 0.0845, 0.0206, 0.574, 0.0347, 0.1306, 0.086, 0.0327, 0.1312, 0.066, 0.5213, 0.1361]}]
valid: 100%|██████████| 1041/1041 [00:14<00:00, 73.08it/s, loss=0.04, metrics={'Accuracy': [0.0082, 0.8353, 0.0, 0.0377, 0.1893, 1.0, 0.0, 0.2442, 0.026, 0.003, 0.0655, 0.0115, 0.9104, 0.544], 'Precision': 0.1121, 'F1': [0.0163, 0.4081, 0.0, 0.0517, 0.0304, 0.8036, 0.0, 0.1444, 0.0165, 0.0028, 0.0396, 0.0218, 0.4909, 0.0896], 'Recall': [0.0082, 0.8353, 0.0, 0.0377, 0.1893, 1.0, 0.0, 0.2442, 0.026, 0.003, 0.0655, 0.0115, 0.9104, 0.544]}]

Epoch 00001: val_loss improved from inf to 0.04004

epoch 2: 100%|██████████| 748/748 [00:12<00:00, 60.35it/s, loss=0.0342, metrics={'Accuracy': [0.0544, 0.5636, 0.0509, 0.0885, 0.0363, 0.7402, 0.0194, 0.1295, 0.0924, 0.0334, 0.146, 0.078, 0.7631, 0.1842], 'Precision': 0.2145, 'F1': [0.0747, 0.5727, 0.0601, 0.0905, 0.0557, 0.5727, 0.0306, 0.1088, 0.085, 0.0429, 0.1125, 0.0889, 0.7579, 0.1137], 'Recall': [0.0544, 0.5636, 0.0509, 0.0885, 0.0363, 0.7402, 0.0194, 0.1295, 0.0924, 0.0334, 0.146, 0.078, 0.7631, 0.1842]}]
valid: 100%|██████████| 1041/1041 [00:13<00:00, 79.96it/s, loss=0.0272, metrics={'Accuracy': [0.0024, 0.9364, 0.0, 0.0457, 0.2334, 1.0, 0.0, 0.1922, 0.015, 0.007, 0.1128, 0.044, 1.0, 0.5215], 'Precision': 0.113, 'F1': [0.0048, 0.3698, 0.0, 0.0637, 0.0216, 0.8366, 0.0, 0.125, 0.0111, 0.0061, 0.0726, 0.0708, 0.4504, 0.0934], 'Recall': [0.0024, 0.9364, 0.0, 0.0457, 0.2334, 1.0, 0.0, 0.1922, 0.015, 0.007, 0.1128, 0.044, 1.0, 0.5215]}]

Epoch 00002: val_loss improved from 0.04004 to 0.02722

epoch 3: 100%|██████████| 748/748 [00:12<00:00, 61.77it/s, loss=0.027, metrics={'Accuracy': [0.0791, 0.6588, 0.0598, 0.0924, 0.0569, 0.8017, 0.0207, 0.1098, 0.1057, 0.0589, 0.1376, 0.1063, 0.849, 0.1214], 'Precision': 0.2316, 'F1': [0.1013, 0.6285, 0.0669, 0.0927, 0.0804, 0.5838, 0.0317, 0.103, 0.0961, 0.0669, 0.1075, 0.1075, 0.8227, 0.1002], 'Recall': [0.0791, 0.6588, 0.0598, 0.0924, 0.0569, 0.8017, 0.0207, 0.1098, 0.1057, 0.0589, 0.1376, 0.1063, 0.849, 0.1214]}]
valid: 100%|██████████| 1041/1041 [00:14<00:00, 72.86it/s, loss=0.0246, metrics={'Accuracy': [0.0456, 1.0, 0.059, 0.0047, 0.224, 1.0, 0.0, 0.139, 0.05, 0.0795, 0.1643, 0.1107, 1.0, 0.3315], 'Precision': 0.1402, 'F1': [0.0873, 0.3314, 0.0351, 0.0093, 0.0189, 0.8196, 0.0, 0.0972, 0.0384, 0.0484, 0.0954, 0.1344, 0.4882, 0.0962], 'Recall': [0.0456, 1.0, 0.059, 0.0047, 0.224, 1.0, 0.0, 0.139, 0.05, 0.0795, 0.1643, 0.1107, 1.0, 0.3315]}]

Epoch 00003: val_loss improved from 0.02722 to 0.02458

epoch 4: 100%|██████████| 748/748 [00:13<00:00, 54.20it/s, loss=0.0248, metrics={'Accuracy': [0.0917, 0.7435, 0.078, 0.1007, 0.0858, 0.8367, 0.0423, 0.0905, 0.1033, 0.0728, 0.1381, 0.1088, 0.879, 0.0769], 'Precision': 0.2472, 'F1': [0.1115, 0.6877, 0.0815, 0.1009, 0.1126, 0.6177, 0.0576, 0.0912, 0.0948, 0.076, 0.1088, 0.1031, 0.8542, 0.0857], 'Recall': [0.0917, 0.7435, 0.078, 0.1007, 0.0858, 0.8367, 0.0423, 0.0905, 0.1033, 0.0728, 0.1381, 0.1088, 0.879, 0.0769]}]
valid: 100%|██████████| 1041/1041 [00:15<00:00, 65.74it/s, loss=0.0236, metrics={'Accuracy': [0.1012, 1.0, 0.0929, 0.015, 0.3438, 1.0, 0.0022, 0.0815, 0.0512, 0.1233, 0.1855, 0.153, 1.0, 0.2093], 'Precision': 0.1715, 'F1': [0.1837, 0.3962, 0.0566, 0.0253, 0.018, 0.8283, 0.0044, 0.0851, 0.0361, 0.0606, 0.0974, 0.1092, 0.4214, 0.0998], 'Recall': [0.1012, 1.0, 0.0929, 0.015, 0.3438, 1.0, 0.0022, 0.0815, 0.0512, 0.1233, 0.1855, 0.153, 1.0, 0.2093]}]

Epoch 00004: val_loss improved from 0.02458 to 0.02363

epoch 5: 100%|██████████| 748/748 [00:13<00:00, 53.66it/s, loss=0.0238, metrics={'Accuracy': [0.1076, 0.8108, 0.083, 0.1031, 0.1029, 0.863, 0.0589, 0.0871, 0.102, 0.0793, 0.1402, 0.1161, 0.8949, 0.0486], 'Precision': 0.2583, 'F1': [0.1279, 0.7308, 0.0829, 0.1019, 0.1253, 0.6538, 0.073, 0.0924, 0.0956, 0.0792, 0.1131, 0.1104, 0.8763, 0.063], 'Recall': [0.1076, 0.8108, 0.083, 0.1031, 0.1029, 0.863, 0.0589, 0.0871, 0.102, 0.0793, 0.1402, 0.1161, 0.8949, 0.0486]}]
valid: 100%|██████████| 1041/1041 [00:17<00:00, 58.36it/s, loss=0.0232, metrics={'Accuracy': [0.1325, 1.0, 0.1183, 0.0223, 0.3407, 1.0, 0.0457, 0.052, 0.07, 0.1395, 0.1618, 0.1595, 1.0, 0.1252], 'Precision': 0.1887, 'F1': [0.2336, 0.3629, 0.0577, 0.0369, 0.0151, 0.8704, 0.0649, 0.0666, 0.0568, 0.0635, 0.107, 0.1213, 0.4621, 0.084], 'Recall': [0.1325, 1.0, 0.1183, 0.0223, 0.3407, 1.0, 0.0457, 0.052, 0.07, 0.1395, 0.1618, 0.1595, 1.0, 0.1252]}]

Epoch 00005: val_loss improved from 0.02363 to 0.02323
Model weights restored to best epoch: 5

predict: 100%|██████████| 1041/1041 [00:05<00:00, 179.81it/s]

Classification report:
              precision    recall  f1-score   support

           0       0.98      0.13      0.23     59760
           1       0.22      1.00      0.37      3897
           2       0.03      0.11      0.05      3897
           3       0.11      0.02      0.04      4000
           4       0.01      0.29      0.01       316
           5       0.77      1.00      0.87      4002
           6       0.11      0.05      0.07      4000
           7       0.09      0.05      0.07      4000
           8       0.05      0.08      0.06      4001
           9       0.04      0.14      0.07      4001
          10       0.08      0.16      0.11      4000
          11       0.11      0.18      0.14      4000
          12       0.29      0.98      0.45       133
          13       0.06      0.12      0.08      4000

    accuracy                           0.19    104007
   macro avg       0.21      0.31      0.19    104007
weighted avg       0.63      0.19      0.21    104007

In [ ]:

            
                Copied!
                
# Run the saved model (read from save_loc) over the full preprocessed dataset.
predicted = pipeline.predict(data, column_types_loc, save_loc)

Outlier_model¶

Placeholder for the outlier_model code that implements the outlier_model.predict() function.

In [ ]:

Simulate stream of data¶

Inverse transform test dataset for evaluation¶

In [23]:

            
                Copied!
                
# Undo the scaler's transform on the test split's non-categorical feature
# columns, then re-attach the identifier and target columns.
scaled_features = data_test_scaled.drop(columns=cat_cols + [identifier, target])
data_test = pd.DataFrame(
    models["scaler"].inverse_transform(scaled_features),
    columns=scaled_features.columns.values,
    # Keep the original row index: the column assignment below aligns on the
    # index, so a fresh RangeIndex would misalign (or NaN-fill) the identifier
    # and target values whenever data_test_scaled is not indexed 0..n-1.
    index=scaled_features.index,
)
data_test[[identifier, target]] = data_test_scaled[[identifier, target]].copy()

In [25]:

            
                Copied!
                
# Preview the first rows of the inverse-transformed test set.
data_test.head()

Out[25]:

	f_c	P	m_d	theta	P_ref	V_DC	V_phaseA	V_phaseB	V_phaseC	I_phaseA	I_phaseB	I_phaseC	sample_id	fault
0	50.000254	2494.761546	311.0	777.396328	2500.0	800.0	2.869625	-270.757247	267.887622	-0.150869	-4.556102	4.706970	1069662	0
1	50.000006	2499.877565	311.0	1172.238561	2500.0	800.0	-259.870040	-18.022479	277.892519	-228.781908	407.314521	-178.532613	614815	7
2	50.000000	2499.999997	311.0	1424.037213	2500.0	800.0	-153.983148	-157.011936	310.995084	-357.403522	349.748723	7.654799	630845	7
3	50.000000	2499.999999	311.0	649.626528	2500.0	800.0	-268.948563	269.717592	-0.769029	-4.415660	4.864604	-0.448944	741526	0
4	50.124340	0.000000	311.0	1131.328035	2500.0	800.0	0.000000	0.000000	0.000000	290.215924	-60.868123	-229.347801	452148	5

Note: loading the whole DL model together with its preprocessors on every iteration is wasteful, but I wanted to send Alex at least some initial code.

In [ ]:

            
                Copied!
                
                    
                    
                
                

        
# Simulated data stream: walk the test set row by row.
# While no fault is active, the outlier model decides whether one has started;
# while a fault is active, the classifier is queried until it predicts 0,
# which ends the fault.
# NOTE(review): outlier_model is not defined anywhere in this notebook yet -
# it must provide predict(row) returning a truthy value when a fault starts.
fault = False
for _index, row in data_test.iterrows():
    if not fault:
        fault = outlier_model.predict(row)
        # Report a start only when the detector actually flagged this row.
        if fault:
            print("fault start")
    else:
        # NOTE(review): this reloads the saved model from save_loc on every row.
        predicted = pipeline.predict(row, column_types_loc, save_loc)
        if predicted == 0:
            fault = False
            print("fault ended")
        else:
            print("fault continues")