1. Libraries¶

Additional libraries¶

arff - library to export dataset to arff files usable by MOA
https://pypi.org/project/arff/
imblearn - library for dealing with imbalanced datasets (in general, we won't need it in case of artificially created TEP datasets) https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.RandomUnderSampler.html

In [1]:

            
                Copied!
                
%pip install arff
# install the libraries, "Jupyter magic" must be in 1st line
%pip install arff
# install the libraries, "Jupyter magic" must be in 1st line

Requirement already satisfied: arff in c:\programdata\anaconda3\lib\site-packages (0.9)
Note: you may need to restart the kernel to use updated packages.

WARNING: You are using pip version 19.2.2, however version 20.0.2 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.

In [1]:

            
                Copied!
                
import glob
import arff
import pandas as pd
from sklearn.preprocessing import StandardScaler

%matplotlib inline
import glob
import arff
import pandas as pd
from sklearn.preprocessing import StandardScaler

%matplotlib inline

2. Dataset¶

Gather the data and create single dataset which can be later divided into training/testing.

In [7]:

            
                Copied!
                
                    
                    
                
                

        
# read all files
paths = glob.glob("Tennessee_Event-Driven/datasets/d*.dat")
paths.sort()
# create a list of corresponding fault ids (there are 2 files train/test per each fault)
fault_ids = [i for i in range(22) for _ in range(2)]
# I extracted the columns/features names from Harward TEP dataset
col_names = [
    "xmeas_1",
    "xmeas_2",
    "xmeas_3",
    "xmeas_4",
    "xmeas_5",
    "xmeas_6",
    "xmeas_7",
    "xmeas_8",
    "xmeas_9",
    "xmeas_10",
    "xmeas_11",
    "xmeas_12",
    "xmeas_13",
    "xmeas_14",
    "xmeas_15",
    "xmeas_16",
    "xmeas_17",
    "xmeas_18",
    "xmeas_19",
    "xmeas_20",
    "xmeas_21",
    "xmeas_22",
    "xmeas_23",
    "xmeas_24",
    "xmeas_25",
    "xmeas_26",
    "xmeas_27",
    "xmeas_28",
    "xmeas_29",
    "xmeas_30",
    "xmeas_31",
    "xmeas_32",
    "xmeas_33",
    "xmeas_34",
    "xmeas_35",
    "xmeas_36",
    "xmeas_37",
    "xmeas_38",
    "xmeas_39",
    "xmeas_40",
    "xmeas_41",
    "xmv_1",
    "xmv_2",
    "xmv_3",
    "xmv_4",
    "xmv_5",
    "xmv_6",
    "xmv_7",
    "xmv_8",
    "xmv_9",
    "xmv_10",
    "xmv_11",
]
# read all files
paths = glob.glob("Tennessee_Event-Driven/datasets/d*.dat")
paths.sort()
# create a list of corresponding fault ids (there are 2 files train/test per each fault)
fault_ids = [i for i in range(22) for _ in range(2)]
# I extracted the columns/features names from Harward TEP dataset
col_names = [
    "xmeas_1",
    "xmeas_2",
    "xmeas_3",
    "xmeas_4",
    "xmeas_5",
    "xmeas_6",
    "xmeas_7",
    "xmeas_8",
    "xmeas_9",
    "xmeas_10",
    "xmeas_11",
    "xmeas_12",
    "xmeas_13",
    "xmeas_14",
    "xmeas_15",
    "xmeas_16",
    "xmeas_17",
    "xmeas_18",
    "xmeas_19",
    "xmeas_20",
    "xmeas_21",
    "xmeas_22",
    "xmeas_23",
    "xmeas_24",
    "xmeas_25",
    "xmeas_26",
    "xmeas_27",
    "xmeas_28",
    "xmeas_29",
    "xmeas_30",
    "xmeas_31",
    "xmeas_32",
    "xmeas_33",
    "xmeas_34",
    "xmeas_35",
    "xmeas_36",
    "xmeas_37",
    "xmeas_38",
    "xmeas_39",
    "xmeas_40",
    "xmeas_41",
    "xmv_1",
    "xmv_2",
    "xmv_3",
    "xmv_4",
    "xmv_5",
    "xmv_6",
    "xmv_7",
    "xmv_8",
    "xmv_9",
    "xmv_10",
    "xmv_11",
]

In [9]:

            
                Copied!
                
                    
                    
                
                

        
data = np.empty([0, 52])
id_col = []
for file, fault_id in zip(paths, fault_ids):
    data_temp = np.genfromtxt(file)
    # d00.dat file is rotated - 52 rows/480 columns, all other files have 52 columns and 480/960 rows (train/test)
    if "d00.dat" in file:
        data_temp = np.rot90(data_temp)
    data = np.vstack((data, data_temp))
    id_col.extend(np.ones(len(data_temp)) * fault_id)
dataset = pd.DataFrame(data=data, columns=col_names)
dataset["fault_id"] = id_col
dataset["fault_id"] = dataset["fault_id"].astype(int)

dataset.to_csv("Tennessee_Event-Driven/datasets/dataset.csv", index=False)
data = np.empty([0, 52])
id_col = []
for file, fault_id in zip(paths, fault_ids):
    data_temp = np.genfromtxt(file)
    # d00.dat file is rotated - 52 rows/480 columns, all other files have 52 columns and 480/960 rows (train/test)
    if "d00.dat" in file:
        data_temp = np.rot90(data_temp)
    data = np.vstack((data, data_temp))
    id_col.extend(np.ones(len(data_temp)) * fault_id)
dataset = pd.DataFrame(data=data, columns=col_names)
dataset["fault_id"] = id_col
dataset["fault_id"] = dataset["fault_id"].astype(int)

dataset.to_csv("Tennessee_Event-Driven/datasets/dataset.csv", index=False)

3. Drop NA values¶

In [19]:

            
                Copied!
                
print("Fraction of NA values : " + str(dataset.isna().values.sum() / len(dataset)))
dataset.dropna(inplace=True)
print("Fraction of NA values : " + str(dataset.isna().values.sum() / len(dataset)))
dataset.dropna(inplace=True)

Fraction of NA values : 0.0

4. Check if the dataset is balanced - i.e. ~same number of samples per fault_id¶

In [13]:

            
                Copied!
                
fig, ax = plt.subplots(1, 1, figsize=(18, 3), facecolor="w", edgecolor="k")
fig.suptitle("Sample count per fault_id", size=20, y=1.12)

def applyPlotStyle(ax):
    ax.yaxis.grid(True, linestyle="-", which="both", color="lightgrey")
    # ax.set_ylim([0,0.55])
    ax.set_xlabel("fault_id")
    ax.set_ylabel("Fraction of all samples")

dataset["fault_id"].value_counts(sort=False, normalize=True).sort_index().plot(kind="bar", ax=ax)
applyPlotStyle(ax)
ax.set_title("TEP dataset");
fig, ax = plt.subplots(1, 1, figsize=(18, 3), facecolor="w", edgecolor="k")
fig.suptitle("Sample count per fault_id", size=20, y=1.12)

def applyPlotStyle(ax):
    ax.yaxis.grid(True, linestyle="-", which="both", color="lightgrey")
    # ax.set_ylim([0,0.55])
    ax.set_xlabel("fault_id")
    ax.set_ylabel("Fraction of all samples")

dataset["fault_id"].value_counts(sort=False, normalize=True).sort_index().plot(kind="bar", ax=ax)
applyPlotStyle(ax)
ax.set_title("TEP dataset");

4.1. Simple random under/over-sampling to overcome the imbalance¶

In [ ]:

            
                Copied!
                
                    
                    
                
                

        
# define undersample strategy
# https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.RandomUnderSampler.html
# 'not minority' = resample all classes to the size of minority class
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

undersample = RandomUnderSampler(sampling_strategy="not minority")
oversample = RandomOverSampler(sampling_strategy="not majority")

dataset_under, labels_under = undersample.fit_resample(dataset[col_names], dataset["fault_id"])
dataset_over, labels_over = oversample.fit_resample(dataset[col_names], dataset["fault_id"])

dataset_under = pd.DataFrame(data=dataset_under, columns=col_names)
dataset_under["fault_id"] = dataset["fault_id"].copy()

dataset_over = pd.DataFrame(data=dataset_over, columns=col_names)
dataset_over["fault_id"] = dataset["fault_id"].copy()
# define undersample strategy
# https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.RandomUnderSampler.html
# 'not minority' = resample all classes to the size of minority class
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

undersample = RandomUnderSampler(sampling_strategy="not minority")
oversample = RandomOverSampler(sampling_strategy="not majority")

dataset_under, labels_under = undersample.fit_resample(dataset[col_names], dataset["fault_id"])
dataset_over, labels_over = oversample.fit_resample(dataset[col_names], dataset["fault_id"])

dataset_under = pd.DataFrame(data=dataset_under, columns=col_names)
dataset_under["fault_id"] = dataset["fault_id"].copy()

dataset_over = pd.DataFrame(data=dataset_over, columns=col_names)
dataset_over["fault_id"] = dataset["fault_id"].copy()

5. Standard-scale the dataset for further processing¶

In [20]:

            
                Copied!
                
                    
                    
                
                

        
scaler = StandardScaler()
# scale only samples not fault_ids
scaler.fit(dataset[col_names])
dataset_scaled = pd.DataFrame(scaler.transform(dataset[col_names]), columns=col_names)
dataset_scaled["fault_id"] = dataset["fault_id"].copy()
dataset_scaled.to_csv("Tennessee_Event-Driven/datasets/dataset_standard_scaled.csv", index=False)
# create arff file for MOA
arff.dump(
    "Tennessee_Event-Driven/datasets/dataset_standard_scaled.arff",
    dataset_scaled.values,
    relation="TEP DATASET",
    names=dataset_scaled.columns.values,
)
scaler = StandardScaler()
# scale only samples not fault_ids
scaler.fit(dataset[col_names])
dataset_scaled = pd.DataFrame(scaler.transform(dataset[col_names]), columns=col_names)
dataset_scaled["fault_id"] = dataset["fault_id"].copy()
dataset_scaled.to_csv("Tennessee_Event-Driven/datasets/dataset_standard_scaled.csv", index=False)
# create arff file for MOA
arff.dump(
    "Tennessee_Event-Driven/datasets/dataset_standard_scaled.arff",
    dataset_scaled.values,
    relation="TEP DATASET",
    names=dataset_scaled.columns.values,
)

6. Dataset feature selection/analysis¶

We use original dataset - daaset before scaling. Scaling removes variance from features. Variance analysis is part of this section.

6.1 Visual check of features distribution per fault_id¶

Idea : Visually check distributions of features to gain idea if some features are better candidates for ML.

In [35]:

            
                Copied!
                
                    
                    
                
                

        
# it's f*cked up, but I have to pass exact number of ax to boxplot otherwise I am unable
# to have separate y-axis for each subplot
fig, ax = plt.subplots(8, 7, figsize=(18, 18), sharey=False, sharex=True)
ax = np.delete(ax.flatten(), [55, 54, 53, 52])
dataset.boxplot(ax=ax, by="fault_id", rot=90, showfliers=False)
fig.suptitle("Scaled TEP dataset feature boxplots, subplot=feature per fault_id", size=30)
plt.subplots_adjust(hspace=0.5);
# it's f*cked up, but I have to pass exact number of ax to boxplot otherwise I am unable
# to have separate y-axis for each subplot
fig, ax = plt.subplots(8, 7, figsize=(18, 18), sharey=False, sharex=True)
ax = np.delete(ax.flatten(), [55, 54, 53, 52])
dataset.boxplot(ax=ax, by="fault_id", rot=90, showfliers=False)
fig.suptitle("Scaled TEP dataset feature boxplots, subplot=feature per fault_id", size=30)
plt.subplots_adjust(hspace=0.5);

C:\ProgramData\Anaconda3\lib\site-packages\pandas\plotting\_core.py:2254: UserWarning: When passing multiple axes, sharex and sharey are ignored. These settings must be specified when creating axes
  return_type=return_type, **kwds)

6.2 Cross correlation of the features¶

Idea : remove redundant features
You may change the correlation_threshold to filter out features that have correlation above certain value.

In [36]:

            
                Copied!
                
                    
                    
                
                

        
# do not correlate fault_ids :)
samples = dataset[dataset.columns[:-1]]

dataset_corr = samples.corr()
correlation_threshold = 0.8
upper = dataset_corr.where(np.triu(np.ones(dataset_corr.shape), k=1).astype(np.bool))
# Select the features with correlations above the threshold
# absolute value - positive/negative correlation
to_drop = [column for column in upper.columns if any(upper[column].abs() > correlation_threshold)]
# do not correlate fault_ids :)
samples = dataset[dataset.columns[:-1]]

dataset_corr = samples.corr()
correlation_threshold = 0.8
upper = dataset_corr.where(np.triu(np.ones(dataset_corr.shape), k=1).astype(np.bool))
# Select the features with correlations above the threshold
# absolute value - positive/negative correlation
to_drop = [column for column in upper.columns if any(upper[column].abs() > correlation_threshold)]

In [37]:

            
                Copied!
                
                    
                    
                
                

        
# Dataframe to hold correlated pairs
dataset_collinear = pd.DataFrame(columns=["drop_feature", "corr_feature", "corr_value"])

# Iterate through the "columns to drop" to record pairs of correlated features
for column in to_drop:

    # Find the correlated features
    corr_features = list(upper.index[upper[column].abs() > correlation_threshold])

    # Find the correlated values
    corr_values = list(upper[column][upper[column].abs() > correlation_threshold])
    drop_features = [column for _ in range(len(corr_features))]

    # Record the information (need a temp df for now)
    temp_df = pd.DataFrame.from_dict(
        {
            "drop_feature": drop_features,
            "corr_feature": corr_features,
            "corr_value": corr_values,
        }
    )
    dataset_collinear = dataset_collinear.append(temp_df, ignore_index=True)

dataset_corr_filtered = dataset_corr.loc[
    list(set(dataset_collinear["corr_feature"])),
    list(set(dataset_collinear["drop_feature"])),
]
# Dataframe to hold correlated pairs
dataset_collinear = pd.DataFrame(columns=["drop_feature", "corr_feature", "corr_value"])

# Iterate through the "columns to drop" to record pairs of correlated features
for column in to_drop:

    # Find the correlated features
    corr_features = list(upper.index[upper[column].abs() > correlation_threshold])

    # Find the correlated values
    corr_values = list(upper[column][upper[column].abs() > correlation_threshold])
    drop_features = [column for _ in range(len(corr_features))]

    # Record the information (need a temp df for now)
    temp_df = pd.DataFrame.from_dict(
        {
            "drop_feature": drop_features,
            "corr_feature": corr_features,
            "corr_value": corr_values,
        }
    )
    dataset_collinear = dataset_collinear.append(temp_df, ignore_index=True)

dataset_corr_filtered = dataset_corr.loc[
    list(set(dataset_collinear["corr_feature"])),
    list(set(dataset_collinear["drop_feature"])),
]

In [48]:

            
                Copied!
                
                    
                    
                
                

        
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
from mpl_toolkits.axes_grid1 import make_axes_locatable

fig.suptitle("Correlation matrix above threshold = " + str(correlation_threshold), size=20)

heatmap = ax.matshow(dataset_corr_filtered, cmap=plt.cm.seismic)
ax.set_xticks(range(dataset_corr_filtered.shape[1]))
ax.set_xticklabels(dataset_corr_filtered.columns.values, fontsize=10, rotation=90)
ax.set_yticks(range(dataset_corr_filtered.shape[0]))
ax.set_yticklabels(dataset_corr_filtered.index.values, fontsize=10)

ax.set_xticks(np.arange(-0.5, dataset_corr_filtered.shape[1] - 1, 1), minor=True)
ax.set_yticks(np.arange(-0.5, dataset_corr_filtered.shape[0] - 1, 1), minor=True)
ax.grid(which="minor", color="w", linestyle="-", linewidth=2.0)

# create an axes on the right side of ax. The width of cax will be 5%
# of ax and the padding between cax and ax will be fixed at 0.05 inch.
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="2%", pad=0.1)

cb = fig.colorbar(heatmap, cax=cax)
cb.ax.tick_params(labelsize=12)
fig.savefig(
    "Tennessee_Event-Driven/results/features_colinearity.pdf",
    transparent=True,
    bbox_inches="tight",
    pad_inches=0,
    frameon=False,
)
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
from mpl_toolkits.axes_grid1 import make_axes_locatable

fig.suptitle("Correlation matrix above threshold = " + str(correlation_threshold), size=20)

heatmap = ax.matshow(dataset_corr_filtered, cmap=plt.cm.seismic)
ax.set_xticks(range(dataset_corr_filtered.shape[1]))
ax.set_xticklabels(dataset_corr_filtered.columns.values, fontsize=10, rotation=90)
ax.set_yticks(range(dataset_corr_filtered.shape[0]))
ax.set_yticklabels(dataset_corr_filtered.index.values, fontsize=10)

ax.set_xticks(np.arange(-0.5, dataset_corr_filtered.shape[1] - 1, 1), minor=True)
ax.set_yticks(np.arange(-0.5, dataset_corr_filtered.shape[0] - 1, 1), minor=True)
ax.grid(which="minor", color="w", linestyle="-", linewidth=2.0)

# create an axes on the right side of ax. The width of cax will be 5%
# of ax and the padding between cax and ax will be fixed at 0.05 inch.
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="2%", pad=0.1)

cb = fig.colorbar(heatmap, cax=cax)
cb.ax.tick_params(labelsize=12)
fig.savefig(
    "Tennessee_Event-Driven/results/features_colinearity.pdf",
    transparent=True,
    bbox_inches="tight",
    pad_inches=0,
    frameon=False,
)

C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:22: MatplotlibDeprecationWarning: 
The frameon kwarg was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use facecolor instead.

6.3. Variance of the features¶

Idea : remove features with low/zero variance -> most likely bad candidates for ML.

In [52]:

            
                Copied!
                
dataset_std = dataset.std().to_frame().rename(columns={"index": "feature", 0: "std"})
dataset_std = dataset_std.sort_values("std", ascending=False)
dataset_std = dataset.std().to_frame().rename(columns={"index": "feature", 0: "std"})
dataset_std = dataset_std.sort_values("std", ascending=False)

In [54]:

            
                Copied!
                
                    
                    
                
                

        
fig, ax = plt.subplots(1, 1, figsize=(16, 5))
fig.suptitle("Sorted features by their variance", size=20)
bar_plot = dataset_std.plot.bar(ax=ax, legend=False, color="black", width=0.6)
ax.tick_params(labelsize=12)
ax.set_xlabel("sorted features", size=12)
# ax.set_ylabel('std',size=12)
fig.savefig(
    "Tennessee_Event-Driven/results/feature_variance.pdf",
    transparent=True,
    bbox_inches="tight",
    pad_inches=0,
    frameon=False,
)
fig, ax = plt.subplots(1, 1, figsize=(16, 5))
fig.suptitle("Sorted features by their variance", size=20)
bar_plot = dataset_std.plot.bar(ax=ax, legend=False, color="black", width=0.6)
ax.tick_params(labelsize=12)
ax.set_xlabel("sorted features", size=12)
# ax.set_ylabel('std',size=12)
fig.savefig(
    "Tennessee_Event-Driven/results/feature_variance.pdf",
    transparent=True,
    bbox_inches="tight",
    pad_inches=0,
    frameon=False,
)

C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:7: MatplotlibDeprecationWarning: 
The frameon kwarg was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use facecolor instead.
  import sys

6.4. Other feature selection methods algorithms¶

http://featureselection.asu.edu/

In [2]:

            
                Copied!
                
dataset = pd.read_csv("Tennessee_Event-Driven/datasets/dataset.csv", index_col=False)
dataset = pd.read_csv("Tennessee_Event-Driven/datasets/dataset.csv", index_col=False)

6.4.1. LAP score [example]¶

In [ ]:

            
                Copied!
                
                    
                    
                
                

        
from skfeature.function.similarity_based import lap_score

dataset_lap_score = lap_score.lap_score(dataset.values)

fig, ax = plt.subplots(1, 1, figsize=(16, 5))
first = 0
last = len(dataset.columns) - 1
ax.bar(
    np.arange(dataset.shape[1])[first:last],
    np.sort(dataset_lap_score)[first:last],
    color="black",
    width=0.8,
)
ax.tick_params(labelsize=10)
ax.set_ylim([0, 1])
ax.set_xlim([-0.5, last - first - 0.5])
ax.set_xticks(np.arange(dataset.shape[1])[first:last])
ax.set_xticklabels(dataset.columns.values[np.argsort(dataset_lap_score)][first:last], rotation=90)
from skfeature.function.similarity_based import lap_score

dataset_lap_score = lap_score.lap_score(dataset.values)

fig, ax = plt.subplots(1, 1, figsize=(16, 5))
first = 0
last = len(dataset.columns) - 1
ax.bar(
    np.arange(dataset.shape[1])[first:last],
    np.sort(dataset_lap_score)[first:last],
    color="black",
    width=0.8,
)
ax.tick_params(labelsize=10)
ax.set_ylim([0, 1])
ax.set_xlim([-0.5, last - first - 0.5])
ax.set_xticks(np.arange(dataset.shape[1])[first:last])
ax.set_xticklabels(dataset.columns.values[np.argsort(dataset_lap_score)][first:last], rotation=90)

In [ ]:

            
                Copied!
                
from skfeature.function.similarity_based import lap_score

dataset_lap_score = lap_score.lap_score(dataset.values)
from skfeature.function.similarity_based import lap_score

dataset_lap_score = lap_score.lap_score(dataset.values)

6.4.x OTHER scores¶

http://featureselection.asu.edu/algorithms.php