1. Libraries¶

In [ ]:

            
                Copied!
                
                    
                    
                
                

        
import pandas as pd
# https://xgboost.readthedocs.io/en/latest/
import xgboost
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# https://scikit-learn.org/stable/modules/svm.html
from sklearn import svm
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
#https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
# defining scoring strategy:
# https://scikit-learn.org/stable/modules/model_evaluation.html#defining-your-scoring-strategy-from-metric-functions
# scoring needs to be changed with string, ie : LogisticRegressionCV(cv=10, random_state=0,multi_class='multinomial', scoring="f1_score").fit(samples, labels)
# https://scikit-learn.org/stable/modules/cross_validation.html
from sklearn.model_selection import cross_val_score
# https://scikit-learn.org/stable/modules/grid_search.html
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
# https://xgboost.readthedocs.io/en/latest/
import xgboost
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# https://scikit-learn.org/stable/modules/svm.html
from sklearn import svm
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
#https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
# defining scoring strategy:
# https://scikit-learn.org/stable/modules/model_evaluation.html#defining-your-scoring-strategy-from-metric-functions
# scoring needs to be changed with string, ie : LogisticRegressionCV(cv=10, random_state=0,multi_class='multinomial', scoring="f1_score").fit(samples, labels)
# https://scikit-learn.org/stable/modules/cross_validation.html
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

2. Dataset¶

In [ ]:

            
                Copied!
                
# Load the dataset once (the file name suggests it is already standard-scaled).
# index_col=False tells pandas not to use the first CSV column as the row index.
dataset_scaled = pd.read_csv(
    'Tennessee_Event-Driven/datasets/dataset_standard_scaled.csv',
    index_col=False,
)

In [ ]:

            
                Copied!
                
# Split the frame into a feature matrix and a target vector.
# The target column is excluded by name rather than by position, so the label
# can never end up among the features if the CSV column order changes. When
# 'fault_id' is the last column this is identical to `columns[:-1]`.
feature_columns = [column for column in dataset_scaled.columns if column != 'fault_id']
samples = dataset_scaled[feature_columns].values
labels = dataset_scaled['fault_id'].values

In [ ]:

            
                Copied!
                
# Hold out 10% of the rows for the classification reports / confusion matrices.
# random_state makes the split (and therefore the reported numbers)
# reproducible between runs; stratify keeps the per-class proportions of
# `labels` the same in the train and test parts.
samples_train, samples_test, labels_train, labels_test = train_test_split(
    samples,
    labels,
    test_size=0.1,
    random_state=0,
    stratify=labels,
)

3. Classifiers¶

In [ ]:

            
                Copied!
                
                    
                    
                
                

        
def _evaluate_classifier(name, clf, show_importances=False):
    """Cross-validate, fit and report on one classifier.

    Uses the notebook-level arrays `samples` / `labels` for the 10-fold
    cross-validation and `samples_train` / `labels_train` /
    `samples_test` / `labels_test` for the hold-out evaluation.

    Args:
        name: Label used as the prefix of every printed line.
        clf: Unfitted scikit-learn compatible classifier. It is fitted in
            place on the training split (cross_val_score itself works on
            clones and leaves `clf` unfitted).
        show_importances: When True, also print `clf.feature_importances_`
            (only meaningful for estimators exposing that attribute).

    Returns:
        A `(scores, predicted)` tuple: the per-fold weighted-F1 scores and
        the predictions for `samples_test`.
    """
    scores = cross_val_score(clf, samples, labels, cv=10, scoring='f1_weighted')
    clf.fit(samples_train, labels_train)
    predicted = clf.predict(samples_test)
    print(name + ' 10CV f1_weighted scores : ' + str(scores))
    print(name + ' classification report :\n' + str(classification_report(labels_test, predicted)))
    print(name + ' confusion matrix :\n' + str(confusion_matrix(labels_test, predicted)))
    if show_importances:
        print(name + ' features importances :\n' + str(clf.feature_importances_))
    return scores, predicted


# Logistic regression
# For multiclass problems, only 'newton-cg', 'sag', 'saga' and 'lbfgs' handle
# multinomial loss; 'liblinear' is limited to one-versus-rest schemes.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument (multinomial is the default with lbfgs) - confirm against the
# installed version before removing it.
LR_clf = LogisticRegression(multi_class='multinomial', solver='lbfgs')
LRscores, LR_predicted = _evaluate_classifier('LR', LR_clf)

# SVM
# about gamma='scale' issue : https://stackoverflow.com/questions/52582796/support-vector-regression-typeerror-must-be-real-number-not-str
SVM_clf = svm.SVC(decision_function_shape='ovo')
SVMscores, SVM_predicted = _evaluate_classifier('SVM', SVM_clf)

# XGBoost
XGBOOST_clf = xgboost.XGBClassifier()
XGBOOSTscores, XGBOOST_predicted = _evaluate_classifier(
    'XGBOOST', XGBOOST_clf, show_importances=True)

# Random Forest
RF_clf = RandomForestClassifier()
RFscores, RF_predicted = _evaluate_classifier(
    'Random Forest', RF_clf, show_importances=True)

3.1. Results¶

LR 10CV f1_weighted scores :

[0.42448604 0.45865517 0.47881965 0.24104686 0.25607518 0.43588939 0.46481111 0.48854089 0.49014324 0.41764563]

LR classification report :

              precision    recall  f1-score   support

           0       0.11      0.07      0.09       135
           1       0.94      0.84      0.89       151
           2       0.88      0.81      0.84       130
           3       0.12      0.14      0.13       137
           4       0.66      0.90      0.76       153
           5       0.88      0.92      0.90       137
           6       1.00      0.89      0.94       164
           7       0.98      0.91      0.94       139
           8       0.28      0.41      0.33       151
           9       0.12      0.11      0.12       134
          10       0.54      0.39      0.45       161
          11       0.05      0.07      0.06       134
          12       0.29      0.27      0.28       138
          13       0.46      0.51      0.49       139
          14       0.03      0.01      0.02       134
          15       0.12      0.14      0.13       132
          16       0.62      0.33      0.43       138
          17       0.72      0.71      0.71       154
          18       0.87      0.81      0.84       161
          19       0.23      0.19      0.21       160
          20       0.54      0.61      0.58       147
          21       0.27      0.43      0.33       141

    accuracy                           0.49      3170
   macro avg       0.49      0.48      0.48      3170
weighted avg       0.50      0.49      0.49      3170

LR confusion matrix :

[[ 10   0   0  25   0   0   0   0  16   9   1  16   3   4   7  13   1   0    2  12   1  15]
 [  3 127   0   1   0   0   0   0   0   1   1   4   0   2   2   5   1   0    0   2   0   2]
 [  1   0 105   2   0   0   0   0   1   3   1   4   0   0   2   1   1   0    0   3   2   4]
 [  9   0   0  19   0   0   0   0  12   7   6  21   4   5   7  14   0   0    2  13   2  16]
 [  0   0   0   4 138   0   0   0   1   3   1   2   0   0   0   2   0   0    0   1   1   0]
 [  0   0   0   3   0 126   0   0   0   1   1   3   0   0   0   0   0   0    0   1   0   2]
 [  1   0   0   1   0   0 146   0   3   3   0   3   1   0   1   0   0   0    0   0   0   5]
 [  1   0   0   0   0   0   0 127   1   2   0   1   0   0   1   5   0   0    0   0   0   1]
 [  2   7  14   2   1   0   0   0  62   1   4   5  17  12   1   3   4   1    0   4  10   1]
 [  9   0   0  12   0   0   0   0  12  15   4  18   6   2   7  10   2   0    2   9   7  19]
 [  3   0   0   8   0   0   0   0   9   9  63  10  10   5   2  14   0   0    0   6   4  18]
 [  6   0   0   5  28   0   0   1  15   8   3   9   3   8   2  16   1   5    1   9   4  10]
 [  2   0   0  10   0  15   0   1  13   5   3   5  37  21   1   5   1   0    9   4   2   4]
 [  2   0   1   2   2   0   0   0  14   0   0   5  13  71   2   2  10   0    1   1   0  13]
 [  4   1   0   3  33   0   0   0   4  21   3   8   1   0   2   7   2  37    0   4   2   2]
 [ 10   0   0  14   0   0   0   1  17   7   8  10   2   8   6  19   1   0    1   7   2  19]
 [  7   0   0  12   0   0   0   0   7   4   7   6   4   9   4  10  45   0    0  11   2  10]
 [  1   0   0   1   6   0   0   0   3   6   1   6   5   0   1   3   0 109    0   4   3   5]
 [  2   0   0   5   0   3   0   0   7   1   1   4   1   0   0   4   0   0  130   1   0   2]
 [ 10   0   0  11   0   0   0   0  14   9   3   6  18   0   3  11   4   0    1  31  30   9]
 [  1   0   0   8   0   0   0   0   5   3   2  13   1   1   6   4   0   0    1   5  90   7]
 [  3   0   0   6   1   0   0   0   6   7   3  14   2   5   9  15   0   0    0   5   4  61]]

SVM 10CV f1_weighted scores :

[0.5435794  0.6021387  0.5920755  0.30431432 0.32529879 0.5764909 0.59781791 0.62112622 0.63663635 0.66662602]

SVM classification report :

              precision    recall  f1-score   support

           0       0.13      0.27      0.17       135
           1       1.00      0.83      0.91       151
           2       1.00      0.81      0.89       130
           3       0.13      0.28      0.17       137
           4       0.87      0.88      0.87       153
           5       0.96      0.88      0.92       137
           6       1.00      0.89      0.94       164
           7       1.00      0.91      0.95       139
           8       0.98      0.87      0.92       151
           9       0.14      0.24      0.18       134
          10       0.53      0.30      0.38       161
          11       0.36      0.19      0.25       134
          12       0.95      0.77      0.85       138
          13       0.97      0.85      0.90       139
          14       0.98      0.71      0.82       134
          15       0.13      0.33      0.18       132
          16       0.40      0.18      0.25       138
          17       0.88      0.75      0.81       154
          18       0.99      0.78      0.88       161
          19       0.60      0.56      0.58       160
          20       0.83      0.50      0.62       147
          21       0.93      0.27      0.42       141

    accuracy                           0.60      3170
   macro avg       0.72      0.59      0.63      3170
weighted avg       0.72      0.60      0.64      3170

SVM confusion matrix :

[[ 37   0   0  36   0   0   0   0   0  19   1   3   0   0   0  27   4   0    0   5   3   0]
 [  6 126   0   5   0   0   0   0   3   3   0   1   0   0   0   5   1   0    0   1   0   0]
 [  6   0 105   8   0   0   0   0   0   3   0   2   0   0   0   5   0   0    0   1   0   0]
 [ 22   0   0  39   0   0   0   0   0  24   2   4   0   0   0  35   2   0    0   6   2   1]
 [  3   0   0   4 135   0   0   0   0   4   0   2   0   0   0   4   0   0    0   1   0   0]
 [  2   0   0   3   0 121   0   0   0   0   0   1   0   0   0   4   1   0    0   5   0   0]
 [  3   0   0   0   0   0 146   0   0   6   0   2   0   0   0   6   0   0    0   0   0   1]
 [  1   0   0   2   0   0   0 127   0   3   1   0   0   0   0   5   0   0    0   0   0   0]
 [  4   0   0   3   0   0   0   0 132   3   1   1   2   1   0   2   2   0    0   0   0   0]
 [ 29   0   0  20   0   0   0   0   0  32   4   5   0   0   0  36   3   0    0   5   0   0]
 [ 23   0   0  25   0   0   0   0   0  13  48   4   0   1   0  25  10   0    0   7   5   0]
 [ 21   0   0  21  21   0   0   0   0  10   4  26   0   0   1  20   4   1    0   5   0   0]
 [  6   0   0   5   0   4   0   0   0   4   4   1 106   2   0   4   1   0    1   0   0   0]
 [  3   0   0   2   0   0   0   0   0   4   0   2   1 118   0   7   2   0    0   0   0   0]
 [  4   0   0   3   0   0   0   0   0   6   0   4   0   0  95   5   0  15    0   1   1   0]
 [ 34   0   0  26   0   0   0   0   0  19   1   1   0   0   0  43   2   0    0   5   1   0]
 [ 21   0   0  26   0   0   0   0   0  14  14   3   0   0   0  26  25   0    0   6   2   1]
 [  5   0   0  10   0   0   0   0   0   7   1   1   0   0   1  10   1 116    0   2   0   0]
 [  6   0   0   9   0   0   0   0   0   5   0   1   3   0   0   7   1   0  126   3   0   0]
 [ 17   0   0  22   0   0   0   0   0  17   1   1   0   0   0  11   1   0    0  90   0   0]
 [ 13   0   0  18   0   1   0   0   0  13   6   2   0   0   0  18   0   0    0   3  73   0]
 [ 22   0   0  24   0   0   0   0   0  14   2   5   0   0   0  28   2   0    0   5   1  38]]

XGBOOST 10CV f1_weighted scores :

[0.68392688 0.74462579 0.74093651 0.36879107 0.41598073 0.72445736 0.7336726  0.7663613  0.70552416 0.75007832]

XGBOOST classification report :

              precision    recall  f1-score   support

           0       0.25      0.47      0.33       135
           1       1.00      0.85      0.92       151
           2       1.00      0.83      0.91       130
           3       0.34      0.52      0.41       137
           4       0.94      0.90      0.92       153
           5       0.94      0.91      0.93       137
           6       1.00      0.89      0.94       164
           7       0.99      0.91      0.95       139
           8       0.96      0.85      0.91       151
           9       0.34      0.43      0.38       134
          10       0.75      0.71      0.73       161
          11       0.86      0.69      0.77       134
          12       0.92      0.77      0.84       138
          13       0.93      0.85      0.89       139
          14       1.00      0.86      0.92       134
          15       0.26      0.43      0.32       132
          16       0.82      0.56      0.66       138
          17       0.94      0.84      0.89       154
          18       0.98      0.81      0.89       161
          19       0.71      0.72      0.72       160
          20       0.75      0.64      0.69       147
          21       1.00      0.91      0.95       141

    accuracy                           0.75      3170
   macro avg       0.80      0.74      0.77      3170
weighted avg       0.81      0.75      0.77      3170

XGBOOST confusion matrix :

[[ 63   0   0  23   0   0   0   0   0  22   1   2   0   0   0  17   1   0    0   6   0   0]
 [ 11 129   0   2   0   0   0   0   2   1   0   0   0   0   0   5   0   0    0   1   0   0]
 [ 12   0 108   2   0   0   0   0   0   2   0   0   0   0   0   2   0   0    0   2   2   0]
 [ 17   0   0  71   0   1   0   0   0  16   2   2   0   0   0  16   2   1    1   6   2   0]
 [  3   0   0   2 137   0   0   0   0   1   0   2   0   0   0   7   0   0    0   1   0   0]
 [  5   0   0   1   0 125   0   0   0   0   0   0   0   0   0   4   0   1    0   1   0   0]
 [  8   0   0   1   0   0 146   0   0   2   0   0   0   0   0   7   0   0    0   0   0   0]
 [  5   0   0   0   0   0   0 127   0   4   0   0   0   0   0   3   0   0    0   0   0   0]
 [  4   0   0   3   0   0   0   0 129   3   1   0   4   5   0   1   0   0    0   1   0   0]
 [ 15   0   0  18   0   0   0   0   0  58   3   1   0   0   0  26   2   1    0   6   4   0]
 [  9   0   0  10   0   0   0   0   0   6 114   0   1   0   0  12   7   0    0   1   1   0]
 [ 10   0   0   7   8   0   0   0   0   7   0  93   0   0   0   5   0   1    0   1   2   0]
 [  6   0   0   2   0   2   0   1   2   1   3   0 106   4   0   9   0   0    0   1   1   0]
 [  7   0   0   1   0   0   0   0   1   1   0   0   4 118   0   4   2   0    0   1   0   0]
 [  5   0   0   3   0   0   0   0   0   2   0   0   0   0 115   4   0   4    0   1   0   0]
 [ 24   0   0  17   0   0   0   0   0  14   7   2   0   0   0  57   1   0    0   5   5   0]
 [  7   0   0  12   0   1   0   0   0   7  16   0   0   0   0  10  77   0    0   2   6   0]
 [  6   0   0   2   0   0   0   0   0   6   0   0   0   0   0   7   0 130    0   1   2   0]
 [  4   0   0   3   0   4   0   0   0   5   0   0   0   0   0   8   0   0  131   5   1   0]
 [ 13   0   0  14   0   0   0   0   0   3   0   5   0   0   0   4   1   0    0 115   5   0]
 [ 14   0   0   8   0   0   0   0   0  10   4   1   0   0   0  10   1   0    1   4  94   0]
 [  4   0   0   4   0   0   0   0   0   2   0   0   0   0   0   2   0   0    0   0   1 128]]

XGBOOST features importances :

[0.12379248 0.0047626  0.00850815 0.04373371 0.0364475  0.00078652 0.01043513 0.01692818 0.03933102 0.10539821 0.01340495 0.0005318 0.01391175 0.0010541  0.00046664 0.01039928 0.0308312  0.01555407 0.02439173 0.01242347 0.0381037  0.0296824  0.00856636 0.0043253 0.00754108 0.00109454 0.00474678 0.00864964 0.00869501 0.00915563 0.01988467 0.00742988 0.01122067 0.0132043  0.00754636 0.00789573 0.00406679 0.00796529 0.00421074 0.00391345 0.00462407 0.00291873 0.00429754 0.03122465 0.10151894 0.02033001 0.02878321 0.00032342 0.         0.01245736 0.03271366 0.03981758]

Random Forest 10CV f1_weighted scores :

[0.69059684 0.74095688 0.74139596 0.34923431 0.42912053 0.71319971 0.7265349  0.76775074 0.67521654 0.72273346]

Random Forest classification report :

              precision    recall  f1-score   support

           0       0.39      0.72      0.51       135
           1       1.00      0.89      0.94       151
           2       1.00      0.83      0.91       130
           3       0.59      0.74      0.66       137
           4       0.88      0.89      0.88       153
           5       0.86      0.85      0.85       137
           6       0.99      0.90      0.95       164
           7       1.00      0.94      0.97       139
           8       1.00      0.96      0.98       151
           9       0.52      0.65      0.58       134
          10       0.96      0.84      0.90       161
          11       0.91      0.73      0.81       134
          12       1.00      0.90      0.95       138
          13       0.99      0.92      0.96       139
          14       1.00      0.87      0.93       134
          15       0.49      0.65      0.56       132
          16       0.93      0.82      0.87       138
          17       0.94      0.88      0.91       154
          18       1.00      0.84      0.92       161
          19       0.80      0.88      0.84       160
          20       0.85      0.72      0.78       147
          21       0.99      0.91      0.95       141

    accuracy                           0.84      3170
   macro avg       0.87      0.83      0.85      3170
weighted avg       0.87      0.84      0.85      3170

Random Forest confusion matrix :

[[ 97   0   0  10   0   0   0   0   0  14   0   1   0   0   0   9   0   0    0   4   0   0]
 [  8 135   0   1   0   0   0   0   0   2   0   0   0   0   0   5   0   0    0   0   0   0]
 [ 13   0 108   1   0   0   0   0   0   1   0   0   0   0   0   6   0   0    0   1   0   0]
 [ 12   0   0 102   0   1   0   0   0   4   0   1   0   0   0   8   1   1    0   5   2   0]
 [  0   0   0   0 136  14   0   0   0   1   0   2   0   0   0   0   0   0    0   0   0   0]
 [  4   0   0   2  10 116   0   0   0   3   0   0   0   0   0   1   0   1    0   0   0   0]
 [  9   0   0   1   0   0 148   0   0   1   0   0   0   0   0   5   0   0    0   0   0   0]
 [  3   0   0   0   0   0   0 131   0   1   0   0   0   0   0   4   0   0    0   0   0   0]
 [  3   0   0   2   0   0   0   0 145   0   0   0   0   0   0   1   0   0    0   0   0   0]
 [ 13   0   0  11   0   0   0   0   0  87   0   1   0   0   0   9   2   1    0   5   4   1]
 [  9   0   0   5   0   0   0   0   0   6 136   0   0   0   0   2   2   0    0   1   0   0]
 [  7   0   0   6   9   0   0   0   0   5   0  98   0   0   0   6   0   2    0   1   0   0]
 [  6   0   0   0   0   1   0   0   0   2   0   0 124   1   0   3   0   0    0   0   1   0]
 [  6   0   0   0   0   0   0   0   0   3   0   0   0 128   0   1   0   0    0   1   0   0]
 [  6   0   0   2   0   0   0   0   0   1   0   0   0   0 117   3   0   4    0   1   0   0]
 [ 14   0   0   7   0   1   0   0   0  15   2   0   0   0   0  86   1   0    0   3   3   0]
 [  7   0   0   4   0   0   0   0   0   4   2   0   0   0   0   2 113   0    0   2   4   0]
 [  4   0   0   4   0   0   0   0   0   4   0   0   0   0   0   5   0 135    0   1   1   0]
 [  6   0   0   2   0   1   0   0   0   5   0   0   0   0   0   7   0   0  136   3   1   0]
 [  1   0   0   7   0   0   0   0   0   3   0   3   0   0   0   3   2   0    0 140   1   0]
 [ 15   0   0   7   0   1   1   0   0   4   1   2   0   0   0   6   0   0    0   4 106   0]
 [  4   0   0   0   0   0   0   0   0   2   0   0   0   0   0   3   0   0    0   2   1 129]]

Random Forest features importances :

[0.0373927  0.00810206 0.01020342 0.02006777 0.01795898 0.01199323 0.02037662 0.01540028 0.0343393  0.02601041 0.02151706 0.0064665 0.02009364 0.0069221  0.006431   0.02152803 0.0142307  0.02657733 0.03480786 0.02227191 0.04406107 0.01665294 0.01672006 0.01113766 0.01661354 0.00877211 0.01333297 0.01801175 0.01938749 0.01216083 0.01905983 0.0100468  0.01381571 0.01823266 0.01333095 0.01168955 0.01188482 0.01856537 0.01270057 0.01215902 0.01280374 0.00821088 0.01059093 0.03070568 0.06515119 0.02355875 0.02400868 0.00638174 0.00640568 0.0309878  0.06080765 0.01936067]

GridSearchCV for better parameter values¶

In [ ]:

            
                Copied!
                
                    
                    
                
                

        
# Exhaustive search over Random Forest hyper-parameters
# (3 * 2 * 5 * 2 = 60 candidates, each scored with 10-fold CV).
# 'sqrt' replaces the former 'auto' value: for classifiers 'auto' meant
# sqrt(n_features), and newer scikit-learn releases no longer accept 'auto'.
param_grid = [{'n_estimators': [100, 200, 500],
               'max_features': ['sqrt', 'log2'],
               'max_depth': [5, 10, 50, 100, None],
               'criterion': ['gini', 'entropy']}]

# The target is multiclass, so the plain 'f1' scorer (binary average) cannot be
# used; 'f1_weighted' matches the metric used for the other classifiers.
RF_clf_gs = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=100),
    param_grid=param_grid,
    scoring='f1_weighted',
    n_jobs=4,
    cv=10,
)
RF_clf_gs.fit(samples, labels)

means = RF_clf_gs.cv_results_['mean_test_score']
stds = RF_clf_gs.cv_results_['std_test_score']
print('RF 10CV f1 score mean with 95% confidence interval : ')
# mean +/- 2 * std across the folds is used as a rough 95% interval.
for mean, std, params in zip(means, stds, RF_clf_gs.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [ ]: