Batch machine learning pipeline¶

OUTLINE:¶

dataset preprocessing
classification pipeline - Random Forest
results improvement
feautre engineering
dataset : TEP

drawing

Random Forest¶

drawing

[1] Verikas, Antanas, et al. "Electromyographic patterns during golf swing: Activation sequence profiling and prediction of shot effectiveness." Sensors 16.4 (2016): 592.

libraries¶

In [1]:

            
                Copied!
                
                    
                    
                
                

        
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from skfeature.function.similarity_based import lap_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

#import xgboost

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline

import glob
import pandas as pd
import numpy as np
from random import randrange

import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from skfeature.function.similarity_based import lap_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

#import xgboost

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline

import glob
import pandas as pd
import numpy as np
from random import randrange

import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

Create a Workshop TEP dataset wih imbalanced classes for batch learning¶

In [66]:

            
                Copied!
                
                    
                    
                
                

        
# drop_indices = []
# for fault in dataset['fault_id'].unique():
#     min_index = dataset[dataset['fault_id']==fault].index.min()
#     max_index = dataset[dataset['fault_id']==fault].index.max()
#     a = randrange(min_index, max_index)
#     b = randrange(min_index, max_index)
#     if a<b:
#         to_drop = np.arange(a,b)
#     else:
#         to_drop = np.arange(b,a)
#     drop_indices.extend(to_drop)
# drop_indices = []
# for fault in dataset['fault_id'].unique():
#     min_index = dataset[dataset['fault_id']==fault].index.min()
#     max_index = dataset[dataset['fault_id']==fault].index.max()
#     a = randrange(min_index, max_index)
#     b = randrange(min_index, max_index)
#     if a
    
        

In [68]:

            
                Copied!
                
# dataset = pd.read_csv('dataset.csv', index_col=False)
# dataset.drop(drop_indices).to_csv('workshop_dataset.csv', index=False)
# dataset.iloc[drop_indices].to_csv('workshop_dataset_stream_testing.csv', index=False)
# dataset = pd.read_csv('dataset.csv', index_col=False)
# dataset.drop(drop_indices).to_csv('workshop_dataset.csv', index=False)
# dataset.iloc[drop_indices].to_csv('workshop_dataset_stream_testing.csv', index=False)

1. Preprocessing¶

In [2]:

            
                Copied!
                
dataset = pd.read_csv('workshop_dataset.csv',index_col=False)
col_names = dataset.columns.tolist()
dataset = pd.read_csv('workshop_dataset.csv',index_col=False)
col_names = dataset.columns.tolist()

In [3]:

            
                Copied!
                
                    
                    
                
                

        
fig, ax = plt.subplots(8,7, figsize=(18, 18))
[fig.delaxes(ax[7,i]) for i in [3,4,5,6]]
ax = np.delete(ax.flatten(), [52,53,54,55])
dataset.boxplot(ax=ax, by='fault_id', rot=90, showfliers=False)
[ax.set_xlabel('') for ax in ax.flatten()[:-7]]
[ax.set_xticklabels([]) for ax in ax.flatten()[:-7]]
fig.subplots_adjust(hspace = 0.3, wspace=0.3)
fig.suptitle("TEP dataset feature boxplots, subplot=feature per fault_id", size=30, y=0.95);
fig, ax = plt.subplots(8,7, figsize=(18, 18))
[fig.delaxes(ax[7,i]) for i in [3,4,5,6]]
ax = np.delete(ax.flatten(), [52,53,54,55])
dataset.boxplot(ax=ax, by='fault_id', rot=90, showfliers=False)
[ax.set_xlabel('') for ax in ax.flatten()[:-7]]
[ax.set_xticklabels([]) for ax in ax.flatten()[:-7]]
fig.subplots_adjust(hspace = 0.3, wspace=0.3)
fig.suptitle("TEP dataset feature boxplots, subplot=feature per fault_id", size=30, y=0.95);

In [4]:

            
                Copied!
                
fig, ax = plt.subplots(1, 1, figsize=(18, 3), facecolor='w', edgecolor='k')
fig.suptitle('Sample count per fault_id', size=20, y=1.12)
sns.countplot(ax = ax, x='fault_id', data=dataset,color='navy');
fig, ax = plt.subplots(1, 1, figsize=(18, 3), facecolor='w', edgecolor='k')
fig.suptitle('Sample count per fault_id', size=20, y=1.12)
sns.countplot(ax = ax, x='fault_id', data=dataset,color='navy');

1.2. Class imbalance impact¶

In [5]:

            
                Copied!
                
                    
                    
                
                

        
# define undersample strategy
# https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.RandomUnderSampler.html
# 'not minority' = resample all classes to the size of minority class
undersample = RandomUnderSampler(sampling_strategy='not minority')
oversample = RandomOverSampler(sampling_strategy='not majority')

dataset_under_values, labels_under = undersample.fit_resample(dataset.drop(columns=['fault_id']), dataset['fault_id'])
dataset_under = pd.DataFrame(np.concatenate([dataset_under_values, labels_under.reshape(-1,1)], axis=1), columns = col_names)
dataset_under['type'] = 'under'
dataset_over_values, labels_over = oversample.fit_resample(dataset.drop(columns=['fault_id']), dataset['fault_id'])
dataset_over = pd.DataFrame(np.concatenate([dataset_over_values, labels_over.reshape(-1,1)], axis=1), columns = col_names)
dataset_over['type'] = 'over'

dataset['type'] = 'original'
# define undersample strategy
# https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.RandomUnderSampler.html
# 'not minority' = resample all classes to the size of minority class
undersample = RandomUnderSampler(sampling_strategy='not minority')
oversample = RandomOverSampler(sampling_strategy='not majority')

dataset_under_values, labels_under = undersample.fit_resample(dataset.drop(columns=['fault_id']), dataset['fault_id'])
dataset_under = pd.DataFrame(np.concatenate([dataset_under_values, labels_under.reshape(-1,1)], axis=1), columns = col_names)
dataset_under['type'] = 'under'
dataset_over_values, labels_over = oversample.fit_resample(dataset.drop(columns=['fault_id']), dataset['fault_id'])
dataset_over = pd.DataFrame(np.concatenate([dataset_over_values, labels_over.reshape(-1,1)], axis=1), columns = col_names)
dataset_over['type'] = 'over'

dataset['type'] = 'original'

In [6]:

            
                Copied!
                
fig, ax = plt.subplots(1, 1, figsize=(18, 3), facecolor='w', edgecolor='k')
fig.suptitle('Sample count per fault_id', size=20, y=1.12)
sns.countplot(ax = ax, x='fault_id', data=pd.concat([dataset, dataset_under, dataset_over], ignore_index=True), hue='type');
fig, ax = plt.subplots(1, 1, figsize=(18, 3), facecolor='w', edgecolor='k')
fig.suptitle('Sample count per fault_id', size=20, y=1.12)
sns.countplot(ax = ax, x='fault_id', data=pd.concat([dataset, dataset_under, dataset_over], ignore_index=True), hue='type');

In [7]:

            
                Copied!
                
                    
                    
                
                

        
def scale_dataset(dataset):
    scaler = StandardScaler()
    scaler.fit(dataset.drop(columns=['fault_id', 'type']))
    dataset_scaled = pd.DataFrame(scaler.transform(dataset.drop(columns=['fault_id', 'type'])),columns=col_names[:-1])
    dataset_scaled['fault_id'] = dataset['fault_id'].copy()
    return dataset_scaled
def scale_dataset(dataset):
    scaler = StandardScaler()
    scaler.fit(dataset.drop(columns=['fault_id', 'type']))
    dataset_scaled = pd.DataFrame(scaler.transform(dataset.drop(columns=['fault_id', 'type'])),columns=col_names[:-1])
    dataset_scaled['fault_id'] = dataset['fault_id'].copy()
    return dataset_scaled

In [8]:

            
                Copied!
                
feature = 'xmeas_3'
ds = dataset.copy()

fig, ax = plt.subplots(1,2, figsize=(18, 3))
ds[[feature, 'fault_id']].boxplot(ax=ax[0], rot=90, by='fault_id', showfliers=False)
scale_dataset(ds)[[feature, 'fault_id']].boxplot(ax=ax[1], rot=90, by='fault_id', showfliers=False)
fig.suptitle("TEP dataset feature scaling boxplots", size=20, y=1.1);
feature = 'xmeas_3'
ds = dataset.copy()

fig, ax = plt.subplots(1,2, figsize=(18, 3))
ds[[feature, 'fault_id']].boxplot(ax=ax[0], rot=90, by='fault_id', showfliers=False)
scale_dataset(ds)[[feature, 'fault_id']].boxplot(ax=ax[1], rot=90, by='fault_id', showfliers=False)
fig.suptitle("TEP dataset feature scaling boxplots", size=20, y=1.1);

2. Classification pipeline¶

2.1. Pipeline¶

In [9]:

            
                Copied!
                
pipe = Pipeline([('balancer',RandomOverSampler()), 
                 ('scaler', StandardScaler()), 
                 ('classifier', RandomForestClassifier())])
#('imputer', SimpleImputer(missing_values=np.nan, strategy='mean'))
pipe = Pipeline([('balancer',RandomOverSampler()), 
                 ('scaler', StandardScaler()), 
                 ('classifier', RandomForestClassifier())])
#('imputer', SimpleImputer(missing_values=np.nan, strategy='mean'))

2.2. Results¶

In [10]:

            
                Copied!
                
                    
                    
                
                

        
X = dataset.dropna().drop(columns=['fault_id','type'])
Y = dataset.dropna()['fault_id']
scores = cross_val_score(pipe, X, Y, cv=10)
print('10-Cross-Validation scores :\n' + str(scores))

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)
pipe.fit(X_train, Y_train)
feature_importances = pipe.named_steps['classifier'].feature_importances_
print('Classification report :\n' + str(classification_report(Y_test, pipe.predict(X_test))))
X = dataset.dropna().drop(columns=['fault_id','type'])
Y = dataset.dropna()['fault_id']
scores = cross_val_score(pipe, X, Y, cv=10)
print('10-Cross-Validation scores :\n' + str(scores))

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)
pipe.fit(X_train, Y_train)
feature_importances = pipe.named_steps['classifier'].feature_importances_
print('Classification report :\n' + str(classification_report(Y_test, pipe.predict(X_test))))

10-Cross-Validation scores :
[0.59202756 0.70246305 0.68506654 0.52       0.31472332 0.64950495
 0.67178218 0.65923725 0.68037661 0.70223325]
Classification report :
              precision    recall  f1-score   support

           0       0.50      0.29      0.36         7
           1       0.94      0.88      0.91       130
           2       0.93      0.87      0.90       134
           3       0.43      0.66      0.52       113
           4       0.90      0.89      0.89       149
           5       0.63      0.73      0.68       105
           6       0.95      0.92      0.93       144
           7       1.00      1.00      1.00        74
           8       0.91      0.92      0.92       147
           9       0.56      0.60      0.58        93
          10       0.75      0.80      0.77       128
          11       0.86      0.68      0.76        28
          12       0.97      0.95      0.96        96
          13       1.00      0.99      0.99        97
          14       0.93      0.83      0.88       115
          15       0.54      0.45      0.49        91
          16       0.89      0.80      0.84        49
          17       0.90      0.93      0.91        68
          18       1.00      0.80      0.89        10
          19       0.83      0.77      0.80        62
          20       0.91      0.74      0.82       121
          21       0.97      0.98      0.98        63

    accuracy                           0.82      2024
   macro avg       0.83      0.79      0.81      2024
weighted avg       0.84      0.82      0.83      2024

3. Improving results¶

3.1. Results Analysis¶

In [11]:

            
                Copied!
                
ds = scale_dataset(dataset)
fault_id = 1

fig, ax = plt.subplots(1,1, figsize=(18, 3))
ds[ds['fault_id']==fault_id].drop(columns=['fault_id']).boxplot(ax=ax, rot=90, showfliers=False)
fig.suptitle('Features values for fault_id : ' + str(fault_id), size=10);
ds = scale_dataset(dataset)
fault_id = 1

fig, ax = plt.subplots(1,1, figsize=(18, 3))
ds[ds['fault_id']==fault_id].drop(columns=['fault_id']).boxplot(ax=ax, rot=90, showfliers=False)
fig.suptitle('Features values for fault_id : ' + str(fault_id), size=10);

3.2. Parameter calibration¶

In [ ]:

            
                Copied!
                
                    
                    
                
                

        
# "classifier__" string same as in pipeline
param_grid = [{'classifier__n_estimators': [100, 200, 500],
               'classifier__max_features': ['auto', 'log2'],
               'classifier__max_depth' : [5,10,50,100,None],
               'classifier__criterion' :['gini', 'entropy']}]

RF_clf_gs = GridSearchCV(pipe, param_grid=param_grid, scoring='f1_weighted',n_jobs=4, cv=10)
RF_clf_gs.fit(dataset.drop(columns=['fault_id','type']), dataset['fault_id'])
means = RF_clf_gs.cv_results_['mean_test_score']
stds = RF_clf_gs.cv_results_['std_test_score']
print('RF 10CV f1 score mean with 95% confidence interval : ')
for mean, std, params in zip(means, stds, RF_clf_gs.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
# "classifier__" string same as in pipeline
param_grid = [{'classifier__n_estimators': [100, 200, 500],
               'classifier__max_features': ['auto', 'log2'],
               'classifier__max_depth' : [5,10,50,100,None],
               'classifier__criterion' :['gini', 'entropy']}]

RF_clf_gs = GridSearchCV(pipe, param_grid=param_grid, scoring='f1_weighted',n_jobs=4, cv=10)
RF_clf_gs.fit(dataset.drop(columns=['fault_id','type']), dataset['fault_id'])
means = RF_clf_gs.cv_results_['mean_test_score']
stds = RF_clf_gs.cv_results_['std_test_score']
print('RF 10CV f1 score mean with 95% confidence interval : ')
for mean, std, params in zip(means, stds, RF_clf_gs.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

3.3. Feature Engineering¶

3.3.1. Feature Selection¶

3.3.1.1. Feature importance (RandomForest, XGBoost)¶

In [79]:

            
                Copied!
                
                    
                    
                
                

        
feature_names = [col_names[i] for i in np.argsort(feature_importances)]

fig, ax = plt.subplots(1, 1, figsize=(16, 5))
fig.suptitle('Sorted features by their importance in Random Forest classifier', size=20)
ax.bar(np.arange(len(feature_importances)), np.sort(feature_importances)[::-1])
ax.tick_params(labelsize=10)
ax.set_xlim([-0.5, len(feature_importances)-0.5])
ax.set_xticks(np.arange(len(feature_importances)))
ax.set_xticklabels(feature_names[::-1], rotation=90);
feature_names = [col_names[i] for i in np.argsort(feature_importances)]

fig, ax = plt.subplots(1, 1, figsize=(16, 5))
fig.suptitle('Sorted features by their importance in Random Forest classifier', size=20)
ax.bar(np.arange(len(feature_importances)), np.sort(feature_importances)[::-1])
ax.tick_params(labelsize=10)
ax.set_xlim([-0.5, len(feature_importances)-0.5])
ax.set_xticks(np.arange(len(feature_importances)))
ax.set_xticklabels(feature_names[::-1], rotation=90);

3.3.1.2. Feature correlation¶

In [80]:

            
                Copied!
                
                    
                    
                
                

        
def corr_heatmap(dataset, title):
    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    fig.suptitle(title, size=20);
    heatmap = ax.matshow(dataset, cmap=plt.cm.seismic)
    ax.set_xticks(range(dataset.shape[1]))
    ax.set_xticklabels(dataset.columns.values, fontsize=10, rotation=90)
    ax.set_yticks(range(dataset.shape[0]))
    ax.set_yticklabels(dataset.index.values, fontsize=10)

    ax.set_xticks(np.arange(-.5, dataset.shape[1]-1, 1), minor=True);
    ax.set_yticks(np.arange(-.5, dataset.shape[0]-1, 1), minor=True);
    ax.grid(which='minor', color='w', linestyle='-', linewidth=2.)

    # create an axes on the right side of ax. The width of cax will be 5%
    # of ax and the padding between cax and ax will be fixed at 0.05 inch.
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="2%", pad=0.1)

    cb = fig.colorbar(heatmap, cax=cax)
    cb.ax.tick_params(labelsize=12)
    
# do not correlate fault_ids and type :)
ds = dataset.copy()
dataset_corr = ds.drop(columns=['fault_id', 'type']).corr()
corr_heatmap(dataset_corr, 'Correlation matrix');
def corr_heatmap(dataset, title):
    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    fig.suptitle(title, size=20);
    heatmap = ax.matshow(dataset, cmap=plt.cm.seismic)
    ax.set_xticks(range(dataset.shape[1]))
    ax.set_xticklabels(dataset.columns.values, fontsize=10, rotation=90)
    ax.set_yticks(range(dataset.shape[0]))
    ax.set_yticklabels(dataset.index.values, fontsize=10)

    ax.set_xticks(np.arange(-.5, dataset.shape[1]-1, 1), minor=True);
    ax.set_yticks(np.arange(-.5, dataset.shape[0]-1, 1), minor=True);
    ax.grid(which='minor', color='w', linestyle='-', linewidth=2.)

    # create an axes on the right side of ax. The width of cax will be 5%
    # of ax and the padding between cax and ax will be fixed at 0.05 inch.
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="2%", pad=0.1)

    cb = fig.colorbar(heatmap, cax=cax)
    cb.ax.tick_params(labelsize=12)
    
# do not correlate fault_ids and type :)
ds = dataset.copy()
dataset_corr = ds.drop(columns=['fault_id', 'type']).corr()
corr_heatmap(dataset_corr, 'Correlation matrix');

3.3.1.3. Feature correlation - Improved¶

In [81]:

            
                Copied!
                
                    
                    
                
                

        
ds = dataset.copy()
dataset_corr = ds.drop(columns=['fault_id', 'type']).corr()
correlation_threshold = 0.8

upper = dataset_corr.where(np.triu(np.ones(dataset_corr.shape), k = 1).astype(np.bool))
# Select the features with correlations above the threshold
# absolute value - positive/negative correlation
to_drop = [column for column in upper.columns if any(upper[column].abs() > correlation_threshold)]

# Dataframe to hold correlated pairs
dataset_collinear = pd.DataFrame(columns = ['drop_feature', 'corr_feature', 'corr_value'])

# Iterate through the "columns to drop" to record pairs of correlated features
for column in to_drop:

    # Find the correlated features
    corr_features = list(upper.index[upper[column].abs() > correlation_threshold])

    # Find the correlated values
    corr_values = list(upper[column][upper[column].abs() > correlation_threshold])
    drop_features = [column for _ in range(len(corr_features))]    

    # Record the information (need a temp df for now)
    temp_df = pd.DataFrame.from_dict({'drop_feature': drop_features,
                                     'corr_feature': corr_features,
                                     'corr_value': corr_values})
    dataset_collinear = dataset_collinear.append(temp_df, ignore_index = True)
    
dataset_corr_filtered = dataset_corr.loc[list(set(dataset_collinear['corr_feature'])), list(set(dataset_collinear['drop_feature']))]

corr_heatmap(dataset_corr_filtered, 'Correlation matrix above threshold = ' + str(correlation_threshold));
ds = dataset.copy()
dataset_corr = ds.drop(columns=['fault_id', 'type']).corr()
correlation_threshold = 0.8

upper = dataset_corr.where(np.triu(np.ones(dataset_corr.shape), k = 1).astype(np.bool))
# Select the features with correlations above the threshold
# absolute value - positive/negative correlation
to_drop = [column for column in upper.columns if any(upper[column].abs() > correlation_threshold)]

# Dataframe to hold correlated pairs
dataset_collinear = pd.DataFrame(columns = ['drop_feature', 'corr_feature', 'corr_value'])

# Iterate through the "columns to drop" to record pairs of correlated features
for column in to_drop:

    # Find the correlated features
    corr_features = list(upper.index[upper[column].abs() > correlation_threshold])

    # Find the correlated values
    corr_values = list(upper[column][upper[column].abs() > correlation_threshold])
    drop_features = [column for _ in range(len(corr_features))]    

    # Record the information (need a temp df for now)
    temp_df = pd.DataFrame.from_dict({'drop_feature': drop_features,
                                     'corr_feature': corr_features,
                                     'corr_value': corr_values})
    dataset_collinear = dataset_collinear.append(temp_df, ignore_index = True)
    
dataset_corr_filtered = dataset_corr.loc[list(set(dataset_collinear['corr_feature'])), list(set(dataset_collinear['drop_feature']))]

corr_heatmap(dataset_corr_filtered, 'Correlation matrix above threshold = ' + str(correlation_threshold));

3.3.1.4. Feature variance(entropy)¶

In [82]:

            
                Copied!
                
fig, ax = plt.subplots(1, 1, figsize=(16, 5))
fig.suptitle('Sorted features by their variance', size=20)
ds.drop(columns=['fault_id', 'type']).std().sort_values(ascending=False).plot.bar();
fig, ax = plt.subplots(1, 1, figsize=(16, 5))
fig.suptitle('Sorted features by their variance', size=20)
ds.drop(columns=['fault_id', 'type']).std().sort_values(ascending=False).plot.bar();

3.3.1.5. Other feature selection methods¶

http://featureselection.asu.edu/
http://featureselection.asu.edu/algorithms.php

In [3]:

            
                Copied!
                
ds = dataset.copy()[:100]
ds = dataset.copy()[:100]

In [6]:

            
                Copied!
                
                    
                    
                
                

        
ds = dataset.copy()[:100]
dataset_lap_score = lap_score.lap_score(ds.drop(columns=['fault_id']).values)
#dataset_lap_score = lap_score.lap_score(ds.drop(columns=['fault_id', 'type']).values)

fig, ax = plt.subplots(1, 1, figsize=(16, 5))
ax.bar(np.arange(len(dataset_lap_score)), np.sort(dataset_lap_score)[::-1])
ax.tick_params(labelsize=10)
ax.set_xlim([-0.5, len(ds)-0.5])
ax.set_xticks(np.arange(len(dataset_lap_score)))
ax.set_xticklabels([col_names[i] for i in np.argsort(dataset_lap_score)][::-1], rotation=90)
ds = dataset.copy()[:100]
dataset_lap_score = lap_score.lap_score(ds.drop(columns=['fault_id']).values)
#dataset_lap_score = lap_score.lap_score(ds.drop(columns=['fault_id', 'type']).values)

fig, ax = plt.subplots(1, 1, figsize=(16, 5))
ax.bar(np.arange(len(dataset_lap_score)), np.sort(dataset_lap_score)[::-1])
ax.tick_params(labelsize=10)
ax.set_xlim([-0.5, len(ds)-0.5])
ax.set_xticks(np.arange(len(dataset_lap_score)))
ax.set_xticklabels([col_names[i] for i in np.argsort(dataset_lap_score)][::-1], rotation=90)

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-6-8bf262a4e9c5> in <module>
      1 ds = dataset.copy()[:100]
----> 2 dataset_lap_score = lap_score.lap_score(ds.drop(columns=['fault_id']).values)
      3 #dataset_lap_score = lap_score.lap_score(ds.drop(columns=['fault_id', 'type']).values)
      4 
      5 fig, ax = plt.subplots(1, 1, figsize=(16, 5))

C:\ProgramData\Anaconda3\lib\site-packages\skfeature\function\similarity_based\lap_score.py in lap_score(X, **kwargs)
     34         W = construct_W(X)
     35     # construct the affinity matrix W
---> 36     W = kwargs['W']
     37     # build the diagonal D matrix from affinity matrix W
     38     D = np.array(W.sum(axis=1))

KeyError: 'W'

3.3.2. Secondary feature creation¶

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rolling.html

In [10]:

            
                Copied!
                
#quantile(), std(), mean(), median(), min(), max()
dataset['new'] = dataset['xmeas_1'].rolling(10).mean()
# .diff()
#quantile(), std(), mean(), median(), min(), max()
dataset['new'] = dataset['xmeas_1'].rolling(10).mean()
# .diff()