Imports¶

This notebook uses preprocessed dataset by following notebook.

notes

CPU monitoring in terminal:
```
top
```

GPU monitoring in terminal:

pip install gpustat
watch -c gpustat -cp --color

In [1]:

            
                Copied!
                
                    
                    
                
                

        
import sys
import pandas as pd

# to save results to data directory
module_path = ".."
if module_path not in sys.path:
    sys.path.insert(1, module_path)
# increase displayed columns in jupyter notebook
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 300)
import sys
import pandas as pd

# to save results to data directory
module_path = ".."
if module_path not in sys.path:
    sys.path.insert(1, module_path)
# increase displayed columns in jupyter notebook
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 300)

In [7]:

            
                Copied!
                
                    
                    
                
                

        
import re
import tracemalloc
from copy import copy
from datetime import datetime
from time import time
from typing import Union

import dill
import lightgbm as lgbm
import lime
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from eli5 import explain_prediction_df, explain_weights, explain_weights_df
from eli5.sklearn import PermutationImportance
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from lime.lime_tabular import LimeTabularExplainer
from pytorch_widedeep import Tab2Vec
from pytorch_widedeep.utils import LabelEncoder
from sklearn.metrics import classification_report, log_loss, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import PowerTransformer
import src
from src import common

tracemalloc.start()

import tracemalloc

import ray
from ray import tune
from ray.tune import JupyterNotebookReporter
from ray.tune.integration.lightgbm import TuneReportCheckpointCallback
from ray.tune.integration.wandb import WandbLogger
from ray.tune.logger import DEFAULT_LOGGERS
from ray.tune.schedulers import AsyncHyperBandScheduler

tracemalloc.start()

# temporarily remove deprecation warnings
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
import re
import tracemalloc
from copy import copy
from datetime import datetime
from time import time
from typing import Union

import dill
import lightgbm as lgbm
import lime
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from eli5 import explain_prediction_df, explain_weights, explain_weights_df
from eli5.sklearn import PermutationImportance
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from lime.lime_tabular import LimeTabularExplainer
from pytorch_widedeep import Tab2Vec
from pytorch_widedeep.utils import LabelEncoder
from sklearn.metrics import classification_report, log_loss, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import PowerTransformer
import src
from src import common

tracemalloc.start()

import tracemalloc

import ray
from ray import tune
from ray.tune import JupyterNotebookReporter
from ray.tune.integration.lightgbm import TuneReportCheckpointCallback
from ray.tune.integration.wandb import WandbLogger
from ray.tune.logger import DEFAULT_LOGGERS
from ray.tune.schedulers import AsyncHyperBandScheduler

tracemalloc.start()

# temporarily remove deprecation warnings
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

Dataset¶

identifiers

In [8]:

            
                Copied!
                
                    
                    
                
                

        
column_types = common.json_load("#datasets/Colab_PowerConverter/column_types.json")
target = column_types["target"]
measurement_label = column_types["measurement_label"]
RANDOM_STATE = 1
TEST_SIZE_TRAIN = 0.2
TEST_SIZE_VALID = 0.5
EMBEDDING = False
TASK = "multiclass" #(or "binary")
column_types = common.json_load("#datasets/Colab_PowerConverter/column_types.json")
target = column_types["target"]
measurement_label = column_types["measurement_label"]
RANDOM_STATE = 1
TEST_SIZE_TRAIN = 0.2
TEST_SIZE_VALID = 0.5
EMBEDDING = False
TASK = "multiclass" #(or "binary")

In [9]:

            
                Copied!
                
df = pd.read_pickle("#datasets/Colab_PowerConverter/dataset.pkl")
df = pd.read_pickle("#datasets/Colab_PowerConverter/dataset.pkl")

In [10]:

            
                Copied!
                
# this measurement did not have a fault (?)
df = df[df[measurement_label]!="Single-Phase_Sensor_Fault"]
df.reset_index(inplace=True, drop=True)
# this measurement did not have a fault (?)
df = df[df[measurement_label]!="Single-Phase_Sensor_Fault"]
df.reset_index(inplace=True, drop=True)

In [11]:

            
                Copied!
                
fault_dict = {}
for label,i in zip(df[measurement_label].unique(), range(len(df[measurement_label].unique()))):
    df.loc[(df[measurement_label]==label) & (df[target]==1), target] = int(i+1)
    fault_dict[label] = int(i+1)
fault_dict = {}
for label,i in zip(df[measurement_label].unique(), range(len(df[measurement_label].unique()))):
    df.loc[(df[measurement_label]==label) & (df[target]==1), target] = int(i+1)
    fault_dict[label] = int(i+1)

In [12]:

            
                Copied!
                
# imbalance of the classes
df[target].value_counts()
# imbalance of the classes
df[target].value_counts()

Out[12]:

0     597599
5      40014
3      40001
6      40001
7      40001
8      40001
9      40001
10     40001
11     40001
13     40001
1      38971
2      38971
4       3166
12      1335
Name: fault, dtype: int64

In [13]:

            
                Copied!
                
fault_dict
fault_dict

Out[13]:

{'Damping-320': 1,
 'Damping-32000': 2,
 'Inertia-1.2': 3,
 'LL_Fault': 4,
 'Three-Phase_Sensor_Fault': 5,
 'Weak_Grid-4_5_mH': 6,
 'Weak_Grid-1_5_mH': 7,
 'Damping-3200': 8,
 'Inertia-0.2': 9,
 'Inertia-2': 10,
 'Single_Phase_Sag': 11,
 'Three_Phase_Grid_Fault': 12,
 'Weak_Grid-7_5_mH': 13}

Preprocessing¶

In [14]:

            
                Copied!
                
df.drop(columns=[measurement_label], inplace=True)
df.drop(columns=[measurement_label], inplace=True)

In [15]:

            
                Copied!
                
df_train, df_valid = train_test_split(df, test_size=TEST_SIZE_TRAIN, stratify=df[target], random_state=RANDOM_STATE)
df_valid, df_test = train_test_split(df_valid, test_size=TEST_SIZE_VALID, stratify=df_valid[target], random_state=RANDOM_STATE)

df_train.reset_index(inplace=True, drop=True)
df_valid.reset_index(inplace=True, drop=True)
df_test.reset_index(inplace=True, drop=True)
df_train, df_valid = train_test_split(df, test_size=TEST_SIZE_TRAIN, stratify=df[target], random_state=RANDOM_STATE)
df_valid, df_test = train_test_split(df_valid, test_size=TEST_SIZE_VALID, stratify=df_valid[target], random_state=RANDOM_STATE)

df_train.reset_index(inplace=True, drop=True)
df_valid.reset_index(inplace=True, drop=True)
df_test.reset_index(inplace=True, drop=True)

In [16]:

            
                Copied!
                
df_train_scaled, Scaler = common.scale(df_train, [target], scaler_sk='Standard')
df_valid_scaled, Scaler = common.scale(df_valid, [target], scaler_sk=Scaler)
df_test_scaled, Scaler = common.scale(df_test, [target], scaler_sk=Scaler)
df_train_scaled, Scaler = common.scale(df_train, [target], scaler_sk='Standard')
df_valid_scaled, Scaler = common.scale(df_valid, [target], scaler_sk=Scaler)
df_test_scaled, Scaler = common.scale(df_test, [target], scaler_sk=Scaler)

Categorical features tranformation¶

In [18]:

            
                Copied!
                
                    
                    
                
                

        
if EMBEDDING:
    CAT_FEATURE_TRANSFORMATION = "Entity Embedding"
    if CAT_FEATURE_TRANSFORMATION == "Label Encoding":
        label_encoder = LabelEncoder(cat_cols)
        label_encoder.fit(data[cat_cols])

        df_train_scaled_enc = df_train_scaled.copy()
        df_valid_scaled_enc = df_valid_scaled.copy()
        df_test_scaled_enc = df_test_scaled.copy()

        df_train_scaled_enc[cat_cols] = label_encoder.transform(
            df_train_scaled_enc[cat_cols]
        )
        df_valid_scaled_enc[cat_cols] = label_encoder.transform(
            df_valid_scaled_enc[cat_cols]
        )
        df_test_scaled_enc[cat_cols] = label_encoder.transform(
            df_test_scaled_enc[cat_cols]
        )
        df_test_scaled_enc[cat_cols].head()

    if CAT_FEATURE_TRANSFORMATION == "Entity Embedding":
        # using pretrained embedding from pytorch-widedeep model and its tab_preprocessor
        with open("dl_entity_emb_model_" + TASK + ".dill", "rb") as f:
            model = dill.load(f)
        with open("dl_entity_emb_model_tab_preprocessor_" + TASK + ".dill", "rb") as f:
            tab_preprocessor = dill.load(f)

        t2v = Tab2Vec(model=model, tab_preprocessor=tab_preprocessor, return_dataframe=True)
        df_train_scaled_enc, df_train_y = t2v.transform(
            df_train_scaled, target_col=target_col
        )
        df_valid_scaled_enc, df_valid_y = t2v.transform(
            df_valid_scaled, target_col=target_col
        )
        df_test_scaled_enc, df_test_y = t2v.transform(
            df_test_scaled, target_col=target_col
        )
        df_train_scaled_enc[target_col] = df_train_y
        df_valid_scaled_enc[target_col] = df_valid_y
        df_test_scaled_enc[target_col] = df_test_y

        cols_list = list(df_test_scaled_enc.columns)
        cat_cols_emb = []
        for cat_col in cat_cols:
            r = re.compile(cat_col + "*")
            cat_cols_emb.extend(list(filter(r.match, cols_list)))
    # df_test_scaled_enc[cat_cols_emb].head()
else:
    df_train_scaled_enc = df_train_scaled.copy()
    df_valid_scaled_enc = df_valid_scaled.copy()
    df_test_scaled_enc = df_test_scaled.copy()
if EMBEDDING:
    CAT_FEATURE_TRANSFORMATION = "Entity Embedding"
    if CAT_FEATURE_TRANSFORMATION == "Label Encoding":
        label_encoder = LabelEncoder(cat_cols)
        label_encoder.fit(data[cat_cols])

        df_train_scaled_enc = df_train_scaled.copy()
        df_valid_scaled_enc = df_valid_scaled.copy()
        df_test_scaled_enc = df_test_scaled.copy()

        df_train_scaled_enc[cat_cols] = label_encoder.transform(
            df_train_scaled_enc[cat_cols]
        )
        df_valid_scaled_enc[cat_cols] = label_encoder.transform(
            df_valid_scaled_enc[cat_cols]
        )
        df_test_scaled_enc[cat_cols] = label_encoder.transform(
            df_test_scaled_enc[cat_cols]
        )
        df_test_scaled_enc[cat_cols].head()

    if CAT_FEATURE_TRANSFORMATION == "Entity Embedding":
        # using pretrained embedding from pytorch-widedeep model and its tab_preprocessor
        with open("dl_entity_emb_model_" + TASK + ".dill", "rb") as f:
            model = dill.load(f)
        with open("dl_entity_emb_model_tab_preprocessor_" + TASK + ".dill", "rb") as f:
            tab_preprocessor = dill.load(f)

        t2v = Tab2Vec(model=model, tab_preprocessor=tab_preprocessor, return_dataframe=True)
        df_train_scaled_enc, df_train_y = t2v.transform(
            df_train_scaled, target_col=target_col
        )
        df_valid_scaled_enc, df_valid_y = t2v.transform(
            df_valid_scaled, target_col=target_col
        )
        df_test_scaled_enc, df_test_y = t2v.transform(
            df_test_scaled, target_col=target_col
        )
        df_train_scaled_enc[target_col] = df_train_y
        df_valid_scaled_enc[target_col] = df_valid_y
        df_test_scaled_enc[target_col] = df_test_y

        cols_list = list(df_test_scaled_enc.columns)
        cat_cols_emb = []
        for cat_col in cat_cols:
            r = re.compile(cat_col + "*")
            cat_cols_emb.extend(list(filter(r.match, cols_list)))
    # df_test_scaled_enc[cat_cols_emb].head()
else:
    df_train_scaled_enc = df_train_scaled.copy()
    df_valid_scaled_enc = df_valid_scaled.copy()
    df_test_scaled_enc = df_test_scaled.copy()

LightGBM¶

In [19]:

            
                Copied!
                
# df_train_scaled_enc = df_train_scaled_enc.sample(100000)
# df_valid_scaled_enc = df_valid_scaled_enc.sample(30000)
# df_train_scaled_enc = df_train_scaled_enc.sample(100000)
# df_valid_scaled_enc = df_valid_scaled_enc.sample(30000)

In [21]:

            
                Copied!
                
NUM_CLASSES = df[target].nunique()
NUM_CLASSES
NUM_CLASSES = df[target].nunique()
NUM_CLASSES

Out[21]:

Prepare Dataset, metric and objective functions¶

In [51]:

            
                Copied!
                
                    
                    
                
                

        
config = {}
if TASK == "binary" or TASK == "multiclass":
    config["objective"] = TASK
    config["num_classes"] = NUM_CLASSES

if TASK == "multiclass":
    ray_metric = "multi_logloss"

if EMBEDDING:
    if CAT_FEATURE_TRANSFORMATION == "Label Encoding":
        lgb_cat_cols = cat_cols_f
    if CAT_FEATURE_TRANSFORMATION == "Entity Embedding":
        lgb_cat_cols = []
else:
    lgb_cat_cols = []

lgbtrain = lgbm.Dataset(
    df_train_scaled_enc.drop(columns=[target]),
    df_train_scaled_enc[target],
    categorical_feature=lgb_cat_cols,
    free_raw_data=False,
)
lgbvalid = lgbm.Dataset(
    df_valid_scaled_enc.drop(columns=[target]),
    df_valid_scaled_enc[target],
    reference=lgbtrain,
    free_raw_data=False,
)
# Final TRAIN/TEST
ftrain = pd.concat([df_train_scaled_enc, df_valid_scaled_enc]).reset_index(
    drop=True
)
flgbtrain = lgbm.Dataset(
    ftrain.drop(columns=[target]),
    ftrain[target],
    categorical_feature=lgb_cat_cols,
    free_raw_data=False,
)
lgbtest = lgbm.Dataset(
    df_test_scaled_enc.drop(columns=[target]),
    df_test_scaled_enc[target],
    categorical_feature=lgb_cat_cols,
    reference=flgbtrain,
    free_raw_data=False,
)
config = {}
if TASK == "binary" or TASK == "multiclass":
    config["objective"] = TASK
    config["num_classes"] = NUM_CLASSES

if TASK == "multiclass":
    ray_metric = "multi_logloss"

if EMBEDDING:
    if CAT_FEATURE_TRANSFORMATION == "Label Encoding":
        lgb_cat_cols = cat_cols_f
    if CAT_FEATURE_TRANSFORMATION == "Entity Embedding":
        lgb_cat_cols = []
else:
    lgb_cat_cols = []

lgbtrain = lgbm.Dataset(
    df_train_scaled_enc.drop(columns=[target]),
    df_train_scaled_enc[target],
    categorical_feature=lgb_cat_cols,
    free_raw_data=False,
)
lgbvalid = lgbm.Dataset(
    df_valid_scaled_enc.drop(columns=[target]),
    df_valid_scaled_enc[target],
    reference=lgbtrain,
    free_raw_data=False,
)
# Final TRAIN/TEST
ftrain = pd.concat([df_train_scaled_enc, df_valid_scaled_enc]).reset_index(
    drop=True
)
flgbtrain = lgbm.Dataset(
    ftrain.drop(columns=[target]),
    ftrain[target],
    categorical_feature=lgb_cat_cols,
    free_raw_data=False,
)
lgbtest = lgbm.Dataset(
    df_test_scaled_enc.drop(columns=[target]),
    df_test_scaled_enc[target],
    categorical_feature=lgb_cat_cols,
    reference=flgbtrain,
    free_raw_data=False,
)

Train model¶

In [26]:

            
                Copied!
                
                    
                    
                
                

        
%%time
model = lgbm.train(
    config,
    flgbtrain,
    valid_sets=[lgbvalid],
    valid_names=[""],
    #feval=feval,
    #fobj=fobj,
    #callbacks=[log_evaluation()],
    )
%%time
model = lgbm.train(
    config,
    flgbtrain,
    valid_sets=[lgbvalid],
    valid_names=[""],
    #feval=feval,
    #fobj=fobj,
    #callbacks=[log_evaluation()],
    )

/home/palo/miniconda3/lib/python3.8/site-packages/lightgbm/basic.py:2065: UserWarning: Using categorical_feature in Dataset.
  _log_warning('Using categorical_feature in Dataset.')
/home/palo/miniconda3/lib/python3.8/site-packages/lightgbm/basic.py:2068: UserWarning: categorical_feature in Dataset is overridden.
New categorical_feature is []
  _log_warning('categorical_feature in Dataset is overridden.\n'

[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007216 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 936057, number of used features: 9
[LightGBM] [Info] Start training from score -0.554117
[LightGBM] [Info] Start training from score -3.284216
[LightGBM] [Info] Start training from score -3.284216
[LightGBM] [Info] Start training from score -3.258130
[LightGBM] [Info] Start training from score -5.794357
[LightGBM] [Info] Start training from score -3.257824
[LightGBM] [Info] Start training from score -3.258130
[LightGBM] [Info] Start training from score -3.258130
[LightGBM] [Info] Start training from score -3.258157
[LightGBM] [Info] Start training from score -3.258157
[LightGBM] [Info] Start training from score -3.258130
[LightGBM] [Info] Start training from score -3.258130
[LightGBM] [Info] Start training from score -6.657690
[LightGBM] [Info] Start training from score -3.258130
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[1]	's multi_logloss: 0.977783
[2]	's multi_logloss: 0.815664
[3]	's multi_logloss: 0.698939
[4]	's multi_logloss: 0.625425
[5]	's multi_logloss: 0.543057
[6]	's multi_logloss: 0.484832
[7]	's multi_logloss: 0.446497
[8]	's multi_logloss: 0.39927
[9]	's multi_logloss: 0.367568
[10]	's multi_logloss: 0.340573
[11]	's multi_logloss: 0.319405
[12]	's multi_logloss: 0.296019
[13]	's multi_logloss: 0.279713
[14]	's multi_logloss: 0.266099
[15]	's multi_logloss: 0.267354
[16]	's multi_logloss: 0.255597
[17]	's multi_logloss: 0.252282
[18]	's multi_logloss: 0.235086
[19]	's multi_logloss: 0.237021
[20]	's multi_logloss: 0.230893
[21]	's multi_logloss: 0.235814
[22]	's multi_logloss: 0.229844
[23]	's multi_logloss: 0.253783
[24]	's multi_logloss: 0.243563
[25]	's multi_logloss: 0.245112
[26]	's multi_logloss: 0.251267
[27]	's multi_logloss: 0.237238
[28]	's multi_logloss: 0.252955
[29]	's multi_logloss: 0.233645
[30]	's multi_logloss: 0.25831
[31]	's multi_logloss: 0.298165
[32]	's multi_logloss: 0.295286
[33]	's multi_logloss: 0.27396
[34]	's multi_logloss: 0.260525
[35]	's multi_logloss: 0.292285
[36]	's multi_logloss: 0.376081
[37]	's multi_logloss: 0.330623
[38]	's multi_logloss: 0.442323
[39]	's multi_logloss: 0.430259
[40]	's multi_logloss: 0.390212
[41]	's multi_logloss: 0.466535
[42]	's multi_logloss: 0.564834
[43]	's multi_logloss: 0.696845
[44]	's multi_logloss: 0.49243
[45]	's multi_logloss: 0.600275
[46]	's multi_logloss: 0.598877
[47]	's multi_logloss: 0.446685
[48]	's multi_logloss: 0.490207
[49]	's multi_logloss: 0.413613
[50]	's multi_logloss: 0.4442
[51]	's multi_logloss: 0.523439
[52]	's multi_logloss: 0.519875
[53]	's multi_logloss: 0.632309
[54]	's multi_logloss: 0.672342
[55]	's multi_logloss: 0.602289
[56]	's multi_logloss: 0.564768
[57]	's multi_logloss: 0.529735
[58]	's multi_logloss: 0.689203
[59]	's multi_logloss: 2.90619
[60]	's multi_logloss: 0.622281
[61]	's multi_logloss: 0.841785
[62]	's multi_logloss: 0.72034
[63]	's multi_logloss: 0.759707
[64]	's multi_logloss: 0.840372
[65]	's multi_logloss: 0.748165
[66]	's multi_logloss: 0.82118
[67]	's multi_logloss: 1.46753
[68]	's multi_logloss: 1.55493
[69]	's multi_logloss: 1.73801
[70]	's multi_logloss: 1.40299
[71]	's multi_logloss: 3.806
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[72]	's multi_logloss: 2.32776
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[73]	's multi_logloss: 2.76761
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[74]	's multi_logloss: 1.50927
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[75]	's multi_logloss: 1.56427
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[76]	's multi_logloss: 1.5167
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[77]	's multi_logloss: 1.4302
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[78]	's multi_logloss: 1.52527
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[79]	's multi_logloss: 1.54124
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[80]	's multi_logloss: 1.48194
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[81]	's multi_logloss: 1.98842
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[82]	's multi_logloss: 2.02313
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[83]	's multi_logloss: 3.80615
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[84]	's multi_logloss: 2.92285
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[85]	's multi_logloss: 2.73017
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[86]	's multi_logloss: 2.81455
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[87]	's multi_logloss: 2.67801
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[88]	's multi_logloss: 3.59612
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[89]	's multi_logloss: 2.67073
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[90]	's multi_logloss: 2.70365
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[91]	's multi_logloss: 2.6407
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[92]	's multi_logloss: 2.73322
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[93]	's multi_logloss: 2.74708
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[94]	's multi_logloss: 3.23696
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[95]	's multi_logloss: 3.33012
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[96]	's multi_logloss: 3.47458
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[97]	's multi_logloss: 3.88501
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[98]	's multi_logloss: 4.22404
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[99]	's multi_logloss: 4.21596
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[100]	's multi_logloss: 4.65451
CPU times: user 5min 27s, sys: 3.14 s, total: 5min 30s
Wall time: 42.4 s

Prediction & Evaluation¶

In [49]:

            
                Copied!
                
if TASK == "binary":
    res = np.rint(model.predict(lgbtest.data))

if TASK == "multiclass":
    res = model.predict(lgbtest.data).argmax(1)

result = pd.DataFrame({"predicted": res,
                       "ground_truth": df_test[target].values,})
if TASK == "binary":
    res = np.rint(model.predict(lgbtest.data))

if TASK == "multiclass":
    res = model.predict(lgbtest.data).argmax(1)

result = pd.DataFrame({"predicted": res,
                       "ground_truth": df_test[target].values,})

In [50]:

            
                Copied!
                
print('Classification report:\n{}'.format(classification_report(result['predicted'], result['ground_truth'])))
print('Classification report:\n{}'.format(classification_report(result['predicted'], result['ground_truth'])))

Classification report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99     60371
           1       0.71      0.75      0.73      3681
           2       0.71      0.77      0.74      3597
           3       0.34      0.40      0.37      3457
           4       0.00      0.00      0.00        19
           5       0.95      0.96      0.95      3959
           6       0.88      0.51      0.65      6840
           7       0.69      0.83      0.75      3318
           8       0.35      0.34      0.34      4137
           9       0.33      0.37      0.35      3511
          10       0.37      0.40      0.38      3736
          11       0.84      0.90      0.87      3753
          12       0.12      0.15      0.13       109
          13       0.79      0.90      0.84      3519

    accuracy                           0.84    104007
   macro avg       0.58      0.59      0.58    104007
weighted avg       0.85      0.84      0.84    104007

w RayTune¶

In [54]:

            
                Copied!
                
                    
                    
                
                

        
start = time()

#config["eta"] = tune.loguniform(1e-4, 1e-1),
#config["subsample"] = tune.uniform(0.5, 1.0),
config["max_depth"] = tune.randint(1, 9),
# config["wandb"]["project"] = "GBM_classifier",
# config["wandb"]["api_key_file"] = "../data/wandb_api.key",
# config["wandb"]["log_config"] = True


def training_function(config, train, valid):
    lgbm_config = config.copy()
    #lgbm_config.pop("wandb")
    trainer = lgbm.train(
        lgbm_config,
        train,
        valid_sets=[valid],
        valid_names=[""],
        callbacks=[
            TuneReportCheckpointCallback(
                {
                    ray_metric: ray_metric,
                }
            )
        ],
    )


asha_scheduler = AsyncHyperBandScheduler(
    time_attr="training_iteration",
    metric=ray_metric,
    mode="min",
    max_t=100,
    grace_period=10,
    reduction_factor=3,
    brackets=1,
)

analysis = tune.run(
    tune.with_parameters(training_function, train=lgbtrain, valid=lgbvalid),
    # resources_per_trial={"cpu": 4, "gpu": 0},
    num_samples=2,
    progress_reporter=JupyterNotebookReporter(overwrite=True),
    scheduler=asha_scheduler,
    config=config,
    #loggers=DEFAULT_LOGGERS + (WandbLogger,),
)
start = time()

#config["eta"] = tune.loguniform(1e-4, 1e-1),
#config["subsample"] = tune.uniform(0.5, 1.0),
config["max_depth"] = tune.randint(1, 9),
# config["wandb"]["project"] = "GBM_classifier",
# config["wandb"]["api_key_file"] = "../data/wandb_api.key",
# config["wandb"]["log_config"] = True


def training_function(config, train, valid):
    lgbm_config = config.copy()
    #lgbm_config.pop("wandb")
    trainer = lgbm.train(
        lgbm_config,
        train,
        valid_sets=[valid],
        valid_names=[""],
        callbacks=[
            TuneReportCheckpointCallback(
                {
                    ray_metric: ray_metric,
                }
            )
        ],
    )


asha_scheduler = AsyncHyperBandScheduler(
    time_attr="training_iteration",
    metric=ray_metric,
    mode="min",
    max_t=100,
    grace_period=10,
    reduction_factor=3,
    brackets=1,
)

analysis = tune.run(
    tune.with_parameters(training_function, train=lgbtrain, valid=lgbvalid),
    # resources_per_trial={"cpu": 4, "gpu": 0},
    num_samples=2,
    progress_reporter=JupyterNotebookReporter(overwrite=True),
    scheduler=asha_scheduler,
    config=config,
    #loggers=DEFAULT_LOGGERS + (WandbLogger,),
)

== Status ==
Current time: 2021-11-08 10:38:03 (running for 00:00:01.22)
Memory usage on this node: 3.0/12.2 GiB
Using AsyncHyperBand: num_stopped=0 Bracket: Iter 90.000: None | Iter 30.000: None | Iter 10.000: None
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/6.47 GiB heap, 0.0/3.24 GiB objects
Result logdir: /home/palo/ray_results/training_function_2021-11-08_10-38-02
Number of trials: 2/2 (2 ERROR)

Trial name	status	loc
training_function_91209_00000	ERROR	172.18.71.208:626
training_function_91209_00001	ERROR	172.18.71.208:624

Number of errored trials: 2

Trial name	# failures	error file
training_function_91209_00000	1	/home/palo/ray_results/training_function_2021-11-08_10-38-02/training_function_91209_00000_0_2021-11-08_10-38-02/error.txt
training_function_91209_00001	1	/home/palo/ray_results/training_function_2021-11-08_10-38-02/training_function_91209_00001_1_2021-11-08_10-38-02/error.txt

---------------------------------------------------------------------------
TuneError                                 Traceback (most recent call last)
/tmp/ipykernel_409/1918576441.py in <module>
     37 )
     38 
---> 39 analysis = tune.run(
     40     tune.with_parameters(training_function, train=lgbtrain, valid=lgbvalid),
     41     # resources_per_trial={"cpu": 4, "gpu": 0},

~/miniconda3/lib/python3.8/site-packages/ray/tune/tune.py in run(run_or_experiment, name, metric, mode, stop, time_budget_s, config, resources_per_trial, num_samples, local_dir, search_alg, scheduler, keep_checkpoints_num, checkpoint_score_attr, checkpoint_freq, checkpoint_at_end, verbose, progress_reporter, log_to_file, trial_name_creator, trial_dirname_creator, sync_config, export_formats, max_failures, fail_fast, restore, server_port, resume, queue_trials, reuse_actors, trial_executor, raise_on_failed_trial, callbacks, max_concurrent_trials, loggers, _remote)
    622     if incomplete_trials:
    623         if raise_on_failed_trial and not state[signal.SIGINT]:
--> 624             raise TuneError("Trials did not complete", incomplete_trials)
    625         else:
    626             logger.error("Trials did not complete: %s", incomplete_trials)

TuneError: ('Trials did not complete', [training_function_91209_00000, training_function_91209_00001])

In [40]:

            
                Copied!
                
analysis.trial_dataframes
analysis.trial_dataframes

Train best params model¶

In [ ]:

            
                Copied!
                
                    
                    
                
                

        
runtime = time() - start
print("Optimization time:\n{}".format(runtime))

params = copy(analysis.get_best_config(ray_metric, "min"))
params.pop("wandb")
# params["n_estimators"] = 1000

start = time()
model = lgbm.train(
    params,
    flgbtrain,
    valid_sets=[lgbtest],
    callbacks=[lgbm.log_evaluation(show_stdv=False)],
)
runtime = time() - start
print("Final model training time:\n{}".format(str(datetime.timedelta(seconds=runtime))))a
runtime = time() - start
print("Optimization time:\n{}".format(runtime))

params = copy(analysis.get_best_config(ray_metric, "min"))
params.pop("wandb")
# params["n_estimators"] = 1000

start = time()
model = lgbm.train(
    params,
    flgbtrain,
    valid_sets=[lgbtest],
    callbacks=[lgbm.log_evaluation(show_stdv=False)],
)
runtime = time() - start
print("Final model training time:\n{}".format(str(datetime.timedelta(seconds=runtime))))a

Tensorboard visualization¶

In [ ]:

            
                Copied!
                
from tensorboard import notebook

notebook.list()
from tensorboard import notebook

notebook.list()

In [ ]:

            
                Copied!
                
%load_ext tensorboard
%tensorboard --logdir ~/ray_results
%load_ext tensorboard
%tensorboard --logdir ~/ray_results