In [1]:
Copied!
import sys
import pandas as pd
# to save results to data directory
module_path = ".."
if module_path not in sys.path:
sys.path.insert(1, module_path)
# increase displayed columns in jupyter notebook
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 300)
import sys
import pandas as pd
# to save results to data directory
module_path = ".."
if module_path not in sys.path:
sys.path.insert(1, module_path)
# increase displayed columns in jupyter notebook
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 300)
In [7]:
Copied!
import re
import tracemalloc
from copy import copy
from datetime import datetime
from time import time
from typing import Union
import dill
import lightgbm as lgbm
import lime
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from eli5 import explain_prediction_df, explain_weights, explain_weights_df
from eli5.sklearn import PermutationImportance
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from lime.lime_tabular import LimeTabularExplainer
from pytorch_widedeep import Tab2Vec
from pytorch_widedeep.utils import LabelEncoder
from sklearn.metrics import classification_report, log_loss, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import PowerTransformer
import src
from src import common
tracemalloc.start()
import tracemalloc
import ray
from ray import tune
from ray.tune import JupyterNotebookReporter
from ray.tune.integration.lightgbm import TuneReportCheckpointCallback
from ray.tune.integration.wandb import WandbLogger
from ray.tune.logger import DEFAULT_LOGGERS
from ray.tune.schedulers import AsyncHyperBandScheduler
tracemalloc.start()
# temporarily remove deprecation warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import re
import tracemalloc
from copy import copy
from datetime import datetime
from time import time
from typing import Union
import dill
import lightgbm as lgbm
import lime
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from eli5 import explain_prediction_df, explain_weights, explain_weights_df
from eli5.sklearn import PermutationImportance
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from lime.lime_tabular import LimeTabularExplainer
from pytorch_widedeep import Tab2Vec
from pytorch_widedeep.utils import LabelEncoder
from sklearn.metrics import classification_report, log_loss, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import PowerTransformer
import src
from src import common
tracemalloc.start()
import tracemalloc
import ray
from ray import tune
from ray.tune import JupyterNotebookReporter
from ray.tune.integration.lightgbm import TuneReportCheckpointCallback
from ray.tune.integration.wandb import WandbLogger
from ray.tune.logger import DEFAULT_LOGGERS
from ray.tune.schedulers import AsyncHyperBandScheduler
tracemalloc.start()
# temporarily remove deprecation warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
Dataset¶
identifiers
In [8]:
Copied!
column_types = common.json_load("#datasets/Colab_PowerConverter/column_types.json")
target = column_types["target"]
measurement_label = column_types["measurement_label"]
RANDOM_STATE = 1
TEST_SIZE_TRAIN = 0.2
TEST_SIZE_VALID = 0.5
EMBEDDING = False
TASK = "multiclass" #(or "binary")
column_types = common.json_load("#datasets/Colab_PowerConverter/column_types.json")
target = column_types["target"]
measurement_label = column_types["measurement_label"]
RANDOM_STATE = 1
TEST_SIZE_TRAIN = 0.2
TEST_SIZE_VALID = 0.5
EMBEDDING = False
TASK = "multiclass" #(or "binary")
In [9]:
Copied!
df = pd.read_pickle("#datasets/Colab_PowerConverter/dataset.pkl")
df = pd.read_pickle("#datasets/Colab_PowerConverter/dataset.pkl")
In [10]:
Copied!
# this measurement did not have a fault (?)
df = df[df[measurement_label]!="Single-Phase_Sensor_Fault"]
df.reset_index(inplace=True, drop=True)
# this measurement did not have a fault (?)
df = df[df[measurement_label]!="Single-Phase_Sensor_Fault"]
df.reset_index(inplace=True, drop=True)
In [11]:
Copied!
fault_dict = {}
for label,i in zip(df[measurement_label].unique(), range(len(df[measurement_label].unique()))):
df.loc[(df[measurement_label]==label) & (df[target]==1), target] = int(i+1)
fault_dict[label] = int(i+1)
fault_dict = {}
for label,i in zip(df[measurement_label].unique(), range(len(df[measurement_label].unique()))):
df.loc[(df[measurement_label]==label) & (df[target]==1), target] = int(i+1)
fault_dict[label] = int(i+1)
In [12]:
Copied!
# imbalance of the classes
df[target].value_counts()
# imbalance of the classes
df[target].value_counts()
Out[12]:
0 597599 5 40014 3 40001 6 40001 7 40001 8 40001 9 40001 10 40001 11 40001 13 40001 1 38971 2 38971 4 3166 12 1335 Name: fault, dtype: int64
In [13]:
Copied!
fault_dict
fault_dict
Out[13]:
{'Damping-320': 1,
'Damping-32000': 2,
'Inertia-1.2': 3,
'LL_Fault': 4,
'Three-Phase_Sensor_Fault': 5,
'Weak_Grid-4_5_mH': 6,
'Weak_Grid-1_5_mH': 7,
'Damping-3200': 8,
'Inertia-0.2': 9,
'Inertia-2': 10,
'Single_Phase_Sag': 11,
'Three_Phase_Grid_Fault': 12,
'Weak_Grid-7_5_mH': 13}
Preprocessing¶
In [14]:
Copied!
df.drop(columns=[measurement_label], inplace=True)
df.drop(columns=[measurement_label], inplace=True)
In [15]:
Copied!
df_train, df_valid = train_test_split(df, test_size=TEST_SIZE_TRAIN, stratify=df[target], random_state=RANDOM_STATE)
df_valid, df_test = train_test_split(df_valid, test_size=TEST_SIZE_VALID, stratify=df_valid[target], random_state=RANDOM_STATE)
df_train.reset_index(inplace=True, drop=True)
df_valid.reset_index(inplace=True, drop=True)
df_test.reset_index(inplace=True, drop=True)
df_train, df_valid = train_test_split(df, test_size=TEST_SIZE_TRAIN, stratify=df[target], random_state=RANDOM_STATE)
df_valid, df_test = train_test_split(df_valid, test_size=TEST_SIZE_VALID, stratify=df_valid[target], random_state=RANDOM_STATE)
df_train.reset_index(inplace=True, drop=True)
df_valid.reset_index(inplace=True, drop=True)
df_test.reset_index(inplace=True, drop=True)
In [16]:
Copied!
df_train_scaled, Scaler = common.scale(df_train, [target], scaler_sk='Standard')
df_valid_scaled, Scaler = common.scale(df_valid, [target], scaler_sk=Scaler)
df_test_scaled, Scaler = common.scale(df_test, [target], scaler_sk=Scaler)
df_train_scaled, Scaler = common.scale(df_train, [target], scaler_sk='Standard')
df_valid_scaled, Scaler = common.scale(df_valid, [target], scaler_sk=Scaler)
df_test_scaled, Scaler = common.scale(df_test, [target], scaler_sk=Scaler)
Categorical features tranformation¶
In [18]:
Copied!
if EMBEDDING:
CAT_FEATURE_TRANSFORMATION = "Entity Embedding"
if CAT_FEATURE_TRANSFORMATION == "Label Encoding":
label_encoder = LabelEncoder(cat_cols)
label_encoder.fit(data[cat_cols])
df_train_scaled_enc = df_train_scaled.copy()
df_valid_scaled_enc = df_valid_scaled.copy()
df_test_scaled_enc = df_test_scaled.copy()
df_train_scaled_enc[cat_cols] = label_encoder.transform(
df_train_scaled_enc[cat_cols]
)
df_valid_scaled_enc[cat_cols] = label_encoder.transform(
df_valid_scaled_enc[cat_cols]
)
df_test_scaled_enc[cat_cols] = label_encoder.transform(
df_test_scaled_enc[cat_cols]
)
df_test_scaled_enc[cat_cols].head()
if CAT_FEATURE_TRANSFORMATION == "Entity Embedding":
# using pretrained embedding from pytorch-widedeep model and its tab_preprocessor
with open("dl_entity_emb_model_" + TASK + ".dill", "rb") as f:
model = dill.load(f)
with open("dl_entity_emb_model_tab_preprocessor_" + TASK + ".dill", "rb") as f:
tab_preprocessor = dill.load(f)
t2v = Tab2Vec(model=model, tab_preprocessor=tab_preprocessor, return_dataframe=True)
df_train_scaled_enc, df_train_y = t2v.transform(
df_train_scaled, target_col=target_col
)
df_valid_scaled_enc, df_valid_y = t2v.transform(
df_valid_scaled, target_col=target_col
)
df_test_scaled_enc, df_test_y = t2v.transform(
df_test_scaled, target_col=target_col
)
df_train_scaled_enc[target_col] = df_train_y
df_valid_scaled_enc[target_col] = df_valid_y
df_test_scaled_enc[target_col] = df_test_y
cols_list = list(df_test_scaled_enc.columns)
cat_cols_emb = []
for cat_col in cat_cols:
r = re.compile(cat_col + "*")
cat_cols_emb.extend(list(filter(r.match, cols_list)))
# df_test_scaled_enc[cat_cols_emb].head()
else:
df_train_scaled_enc = df_train_scaled.copy()
df_valid_scaled_enc = df_valid_scaled.copy()
df_test_scaled_enc = df_test_scaled.copy()
if EMBEDDING:
CAT_FEATURE_TRANSFORMATION = "Entity Embedding"
if CAT_FEATURE_TRANSFORMATION == "Label Encoding":
label_encoder = LabelEncoder(cat_cols)
label_encoder.fit(data[cat_cols])
df_train_scaled_enc = df_train_scaled.copy()
df_valid_scaled_enc = df_valid_scaled.copy()
df_test_scaled_enc = df_test_scaled.copy()
df_train_scaled_enc[cat_cols] = label_encoder.transform(
df_train_scaled_enc[cat_cols]
)
df_valid_scaled_enc[cat_cols] = label_encoder.transform(
df_valid_scaled_enc[cat_cols]
)
df_test_scaled_enc[cat_cols] = label_encoder.transform(
df_test_scaled_enc[cat_cols]
)
df_test_scaled_enc[cat_cols].head()
if CAT_FEATURE_TRANSFORMATION == "Entity Embedding":
# using pretrained embedding from pytorch-widedeep model and its tab_preprocessor
with open("dl_entity_emb_model_" + TASK + ".dill", "rb") as f:
model = dill.load(f)
with open("dl_entity_emb_model_tab_preprocessor_" + TASK + ".dill", "rb") as f:
tab_preprocessor = dill.load(f)
t2v = Tab2Vec(model=model, tab_preprocessor=tab_preprocessor, return_dataframe=True)
df_train_scaled_enc, df_train_y = t2v.transform(
df_train_scaled, target_col=target_col
)
df_valid_scaled_enc, df_valid_y = t2v.transform(
df_valid_scaled, target_col=target_col
)
df_test_scaled_enc, df_test_y = t2v.transform(
df_test_scaled, target_col=target_col
)
df_train_scaled_enc[target_col] = df_train_y
df_valid_scaled_enc[target_col] = df_valid_y
df_test_scaled_enc[target_col] = df_test_y
cols_list = list(df_test_scaled_enc.columns)
cat_cols_emb = []
for cat_col in cat_cols:
r = re.compile(cat_col + "*")
cat_cols_emb.extend(list(filter(r.match, cols_list)))
# df_test_scaled_enc[cat_cols_emb].head()
else:
df_train_scaled_enc = df_train_scaled.copy()
df_valid_scaled_enc = df_valid_scaled.copy()
df_test_scaled_enc = df_test_scaled.copy()
LightGBM¶
In [19]:
Copied!
# df_train_scaled_enc = df_train_scaled_enc.sample(100000)
# df_valid_scaled_enc = df_valid_scaled_enc.sample(30000)
# df_train_scaled_enc = df_train_scaled_enc.sample(100000)
# df_valid_scaled_enc = df_valid_scaled_enc.sample(30000)
In [21]:
Copied!
NUM_CLASSES = df[target].nunique()
NUM_CLASSES
NUM_CLASSES = df[target].nunique()
NUM_CLASSES
Out[21]:
14
Prepare Dataset, metric and objective functions¶
In [51]:
Copied!
config = {}
if TASK == "binary" or TASK == "multiclass":
config["objective"] = TASK
config["num_classes"] = NUM_CLASSES
if TASK == "multiclass":
ray_metric = "multi_logloss"
if EMBEDDING:
if CAT_FEATURE_TRANSFORMATION == "Label Encoding":
lgb_cat_cols = cat_cols_f
if CAT_FEATURE_TRANSFORMATION == "Entity Embedding":
lgb_cat_cols = []
else:
lgb_cat_cols = []
lgbtrain = lgbm.Dataset(
df_train_scaled_enc.drop(columns=[target]),
df_train_scaled_enc[target],
categorical_feature=lgb_cat_cols,
free_raw_data=False,
)
lgbvalid = lgbm.Dataset(
df_valid_scaled_enc.drop(columns=[target]),
df_valid_scaled_enc[target],
reference=lgbtrain,
free_raw_data=False,
)
# Final TRAIN/TEST
ftrain = pd.concat([df_train_scaled_enc, df_valid_scaled_enc]).reset_index(
drop=True
)
flgbtrain = lgbm.Dataset(
ftrain.drop(columns=[target]),
ftrain[target],
categorical_feature=lgb_cat_cols,
free_raw_data=False,
)
lgbtest = lgbm.Dataset(
df_test_scaled_enc.drop(columns=[target]),
df_test_scaled_enc[target],
categorical_feature=lgb_cat_cols,
reference=flgbtrain,
free_raw_data=False,
)
config = {}
if TASK == "binary" or TASK == "multiclass":
config["objective"] = TASK
config["num_classes"] = NUM_CLASSES
if TASK == "multiclass":
ray_metric = "multi_logloss"
if EMBEDDING:
if CAT_FEATURE_TRANSFORMATION == "Label Encoding":
lgb_cat_cols = cat_cols_f
if CAT_FEATURE_TRANSFORMATION == "Entity Embedding":
lgb_cat_cols = []
else:
lgb_cat_cols = []
lgbtrain = lgbm.Dataset(
df_train_scaled_enc.drop(columns=[target]),
df_train_scaled_enc[target],
categorical_feature=lgb_cat_cols,
free_raw_data=False,
)
lgbvalid = lgbm.Dataset(
df_valid_scaled_enc.drop(columns=[target]),
df_valid_scaled_enc[target],
reference=lgbtrain,
free_raw_data=False,
)
# Final TRAIN/TEST
ftrain = pd.concat([df_train_scaled_enc, df_valid_scaled_enc]).reset_index(
drop=True
)
flgbtrain = lgbm.Dataset(
ftrain.drop(columns=[target]),
ftrain[target],
categorical_feature=lgb_cat_cols,
free_raw_data=False,
)
lgbtest = lgbm.Dataset(
df_test_scaled_enc.drop(columns=[target]),
df_test_scaled_enc[target],
categorical_feature=lgb_cat_cols,
reference=flgbtrain,
free_raw_data=False,
)
Train model¶
In [26]:
Copied!
%%time
model = lgbm.train(
config,
flgbtrain,
valid_sets=[lgbvalid],
valid_names=[""],
#feval=feval,
#fobj=fobj,
#callbacks=[log_evaluation()],
)
%%time
model = lgbm.train(
config,
flgbtrain,
valid_sets=[lgbvalid],
valid_names=[""],
#feval=feval,
#fobj=fobj,
#callbacks=[log_evaluation()],
)
/home/palo/miniconda3/lib/python3.8/site-packages/lightgbm/basic.py:2065: UserWarning: Using categorical_feature in Dataset.
_log_warning('Using categorical_feature in Dataset.')
/home/palo/miniconda3/lib/python3.8/site-packages/lightgbm/basic.py:2068: UserWarning: categorical_feature in Dataset is overridden.
New categorical_feature is []
_log_warning('categorical_feature in Dataset is overridden.\n'
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007216 seconds. You can set `force_row_wise=true` to remove the overhead. And if memory is not enough, you can set `force_col_wise=true`. [LightGBM] [Info] Total Bins 2295 [LightGBM] [Info] Number of data points in the train set: 936057, number of used features: 9 [LightGBM] [Info] Start training from score -0.554117 [LightGBM] [Info] Start training from score -3.284216 [LightGBM] [Info] Start training from score -3.284216 [LightGBM] [Info] Start training from score -3.258130 [LightGBM] [Info] Start training from score -5.794357 [LightGBM] [Info] Start training from score -3.257824 [LightGBM] [Info] Start training from score -3.258130 [LightGBM] [Info] Start training from score -3.258130 [LightGBM] [Info] Start training from score -3.258157 [LightGBM] [Info] Start training from score -3.258157 [LightGBM] [Info] Start training from score -3.258130 [LightGBM] [Info] Start training from score -3.258130 [LightGBM] [Info] Start training from score -6.657690 [LightGBM] [Info] Start training from score -3.258130 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [1] 's multi_logloss: 0.977783 [2] 's multi_logloss: 0.815664 [3] 's multi_logloss: 0.698939 [4] 's multi_logloss: 0.625425 [5] 's multi_logloss: 0.543057 [6] 's multi_logloss: 0.484832 [7] 's multi_logloss: 0.446497 [8] 's multi_logloss: 0.39927 [9] 's multi_logloss: 0.367568 [10] 's multi_logloss: 0.340573 [11] 's multi_logloss: 0.319405 [12] 's multi_logloss: 0.296019 [13] 's multi_logloss: 0.279713 [14] 's multi_logloss: 0.266099 [15] 's multi_logloss: 0.267354 [16] 's multi_logloss: 0.255597 [17] 's multi_logloss: 0.252282 [18] 's multi_logloss: 0.235086 [19] 's multi_logloss: 0.237021 [20] 's multi_logloss: 0.230893 [21] 's multi_logloss: 0.235814 [22] 's multi_logloss: 0.229844 [23] 's multi_logloss: 0.253783 [24] 's multi_logloss: 0.243563 [25] 's multi_logloss: 0.245112 [26] 's multi_logloss: 0.251267 [27] 's multi_logloss: 0.237238 [28] 's multi_logloss: 0.252955 [29] 's multi_logloss: 0.233645 [30] 's multi_logloss: 0.25831 [31] 's multi_logloss: 0.298165 [32] 's multi_logloss: 0.295286 [33] 's multi_logloss: 0.27396 [34] 's multi_logloss: 0.260525 [35] 's multi_logloss: 0.292285 [36] 's multi_logloss: 0.376081 [37] 's multi_logloss: 0.330623 [38] 's multi_logloss: 0.442323 [39] 's multi_logloss: 0.430259 [40] 's multi_logloss: 0.390212 [41] 's multi_logloss: 0.466535 [42] 's multi_logloss: 0.564834 [43] 's multi_logloss: 0.696845 [44] 's multi_logloss: 0.49243 [45] 's multi_logloss: 0.600275 [46] 's multi_logloss: 0.598877 [47] 's multi_logloss: 0.446685 [48] 's multi_logloss: 0.490207 [49] 's multi_logloss: 0.413613 [50] 's multi_logloss: 0.4442 [51] 's multi_logloss: 0.523439 [52] 's multi_logloss: 0.519875 [53] 's multi_logloss: 0.632309 [54] 's multi_logloss: 0.672342 [55] 's multi_logloss: 0.602289 [56] 's multi_logloss: 0.564768 [57] 's multi_logloss: 0.529735 [58] 's multi_logloss: 0.689203 [59] 's multi_logloss: 2.90619 [60] 's multi_logloss: 0.622281 [61] 's multi_logloss: 0.841785 [62] 's multi_logloss: 0.72034 [63] 's multi_logloss: 0.759707 [64] 's multi_logloss: 0.840372 [65] 's multi_logloss: 0.748165 [66] 's multi_logloss: 0.82118 [67] 's multi_logloss: 1.46753 [68] 's multi_logloss: 1.55493 [69] 's multi_logloss: 1.73801 [70] 's multi_logloss: 1.40299 [71] 's multi_logloss: 3.806 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [72] 's multi_logloss: 2.32776 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [73] 's multi_logloss: 2.76761 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [74] 's multi_logloss: 1.50927 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [75] 's multi_logloss: 1.56427 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [76] 's multi_logloss: 1.5167 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [77] 's multi_logloss: 1.4302 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [78] 's multi_logloss: 1.52527 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [79] 's multi_logloss: 1.54124 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [80] 's multi_logloss: 1.48194 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [81] 's multi_logloss: 1.98842 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [82] 's multi_logloss: 2.02313 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [83] 's multi_logloss: 3.80615 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [84] 's multi_logloss: 2.92285 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [85] 's multi_logloss: 2.73017 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [86] 's multi_logloss: 2.81455 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [87] 's multi_logloss: 2.67801 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [88] 's multi_logloss: 3.59612 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [89] 's multi_logloss: 2.67073 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [90] 's multi_logloss: 2.70365 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [91] 's multi_logloss: 2.6407 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [92] 's multi_logloss: 2.73322 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [93] 's multi_logloss: 2.74708 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [94] 's multi_logloss: 3.23696 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [95] 's multi_logloss: 3.33012 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [96] 's multi_logloss: 3.47458 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [97] 's multi_logloss: 3.88501 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [98] 's multi_logloss: 4.22404 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [99] 's multi_logloss: 4.21596 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [100] 's multi_logloss: 4.65451 CPU times: user 5min 27s, sys: 3.14 s, total: 5min 30s Wall time: 42.4 s
Prediction & Evaluation¶
In [49]:
Copied!
if TASK == "binary":
res = np.rint(model.predict(lgbtest.data))
if TASK == "multiclass":
res = model.predict(lgbtest.data).argmax(1)
result = pd.DataFrame({"predicted": res,
"ground_truth": df_test[target].values,})
if TASK == "binary":
res = np.rint(model.predict(lgbtest.data))
if TASK == "multiclass":
res = model.predict(lgbtest.data).argmax(1)
result = pd.DataFrame({"predicted": res,
"ground_truth": df_test[target].values,})
In [50]:
Copied!
print('Classification report:\n{}'.format(classification_report(result['predicted'], result['ground_truth'])))
print('Classification report:\n{}'.format(classification_report(result['predicted'], result['ground_truth'])))
Classification report:
precision recall f1-score support
0 0.99 0.98 0.99 60371
1 0.71 0.75 0.73 3681
2 0.71 0.77 0.74 3597
3 0.34 0.40 0.37 3457
4 0.00 0.00 0.00 19
5 0.95 0.96 0.95 3959
6 0.88 0.51 0.65 6840
7 0.69 0.83 0.75 3318
8 0.35 0.34 0.34 4137
9 0.33 0.37 0.35 3511
10 0.37 0.40 0.38 3736
11 0.84 0.90 0.87 3753
12 0.12 0.15 0.13 109
13 0.79 0.90 0.84 3519
accuracy 0.84 104007
macro avg 0.58 0.59 0.58 104007
weighted avg 0.85 0.84 0.84 104007
w RayTune¶
In [54]:
Copied!
start = time()
#config["eta"] = tune.loguniform(1e-4, 1e-1),
#config["subsample"] = tune.uniform(0.5, 1.0),
config["max_depth"] = tune.randint(1, 9),
# config["wandb"]["project"] = "GBM_classifier",
# config["wandb"]["api_key_file"] = "../data/wandb_api.key",
# config["wandb"]["log_config"] = True
def training_function(config, train, valid):
lgbm_config = config.copy()
#lgbm_config.pop("wandb")
trainer = lgbm.train(
lgbm_config,
train,
valid_sets=[valid],
valid_names=[""],
callbacks=[
TuneReportCheckpointCallback(
{
ray_metric: ray_metric,
}
)
],
)
asha_scheduler = AsyncHyperBandScheduler(
time_attr="training_iteration",
metric=ray_metric,
mode="min",
max_t=100,
grace_period=10,
reduction_factor=3,
brackets=1,
)
analysis = tune.run(
tune.with_parameters(training_function, train=lgbtrain, valid=lgbvalid),
# resources_per_trial={"cpu": 4, "gpu": 0},
num_samples=2,
progress_reporter=JupyterNotebookReporter(overwrite=True),
scheduler=asha_scheduler,
config=config,
#loggers=DEFAULT_LOGGERS + (WandbLogger,),
)
start = time()
#config["eta"] = tune.loguniform(1e-4, 1e-1),
#config["subsample"] = tune.uniform(0.5, 1.0),
config["max_depth"] = tune.randint(1, 9),
# config["wandb"]["project"] = "GBM_classifier",
# config["wandb"]["api_key_file"] = "../data/wandb_api.key",
# config["wandb"]["log_config"] = True
def training_function(config, train, valid):
lgbm_config = config.copy()
#lgbm_config.pop("wandb")
trainer = lgbm.train(
lgbm_config,
train,
valid_sets=[valid],
valid_names=[""],
callbacks=[
TuneReportCheckpointCallback(
{
ray_metric: ray_metric,
}
)
],
)
asha_scheduler = AsyncHyperBandScheduler(
time_attr="training_iteration",
metric=ray_metric,
mode="min",
max_t=100,
grace_period=10,
reduction_factor=3,
brackets=1,
)
analysis = tune.run(
tune.with_parameters(training_function, train=lgbtrain, valid=lgbvalid),
# resources_per_trial={"cpu": 4, "gpu": 0},
num_samples=2,
progress_reporter=JupyterNotebookReporter(overwrite=True),
scheduler=asha_scheduler,
config=config,
#loggers=DEFAULT_LOGGERS + (WandbLogger,),
)
== Status ==
Current time: 2021-11-08 10:38:03 (running for 00:00:01.22)
Memory usage on this node: 3.0/12.2 GiB
Using AsyncHyperBand: num_stopped=0 Bracket: Iter 90.000: None | Iter 30.000: None | Iter 10.000: None
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/6.47 GiB heap, 0.0/3.24 GiB objects
Result logdir: /home/palo/ray_results/training_function_2021-11-08_10-38-02
Number of trials: 2/2 (2 ERROR)
Number of errored trials: 2
Current time: 2021-11-08 10:38:03 (running for 00:00:01.22)
Memory usage on this node: 3.0/12.2 GiB
Using AsyncHyperBand: num_stopped=0 Bracket: Iter 90.000: None | Iter 30.000: None | Iter 10.000: None
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/6.47 GiB heap, 0.0/3.24 GiB objects
Result logdir: /home/palo/ray_results/training_function_2021-11-08_10-38-02
Number of trials: 2/2 (2 ERROR)
| Trial name | status | loc |
|---|---|---|
| training_function_91209_00000 | ERROR | 172.18.71.208:626 |
| training_function_91209_00001 | ERROR | 172.18.71.208:624 |
Number of errored trials: 2
| Trial name | # failures | error file |
|---|---|---|
| training_function_91209_00000 | 1 | /home/palo/ray_results/training_function_2021-11-08_10-38-02/training_function_91209_00000_0_2021-11-08_10-38-02/error.txt |
| training_function_91209_00001 | 1 | /home/palo/ray_results/training_function_2021-11-08_10-38-02/training_function_91209_00001_1_2021-11-08_10-38-02/error.txt |
--------------------------------------------------------------------------- TuneError Traceback (most recent call last) /tmp/ipykernel_409/1918576441.py in <module> 37 ) 38 ---> 39 analysis = tune.run( 40 tune.with_parameters(training_function, train=lgbtrain, valid=lgbvalid), 41 # resources_per_trial={"cpu": 4, "gpu": 0}, ~/miniconda3/lib/python3.8/site-packages/ray/tune/tune.py in run(run_or_experiment, name, metric, mode, stop, time_budget_s, config, resources_per_trial, num_samples, local_dir, search_alg, scheduler, keep_checkpoints_num, checkpoint_score_attr, checkpoint_freq, checkpoint_at_end, verbose, progress_reporter, log_to_file, trial_name_creator, trial_dirname_creator, sync_config, export_formats, max_failures, fail_fast, restore, server_port, resume, queue_trials, reuse_actors, trial_executor, raise_on_failed_trial, callbacks, max_concurrent_trials, loggers, _remote) 622 if incomplete_trials: 623 if raise_on_failed_trial and not state[signal.SIGINT]: --> 624 raise TuneError("Trials did not complete", incomplete_trials) 625 else: 626 logger.error("Trials did not complete: %s", incomplete_trials) TuneError: ('Trials did not complete', [training_function_91209_00000, training_function_91209_00001])
In [40]:
Copied!
analysis.trial_dataframes
analysis.trial_dataframes
Train best params model¶
In [ ]:
Copied!
runtime = time() - start
print("Optimization time:\n{}".format(runtime))
params = copy(analysis.get_best_config(ray_metric, "min"))
params.pop("wandb")
# params["n_estimators"] = 1000
start = time()
model = lgbm.train(
params,
flgbtrain,
valid_sets=[lgbtest],
callbacks=[lgbm.log_evaluation(show_stdv=False)],
)
runtime = time() - start
print("Final model training time:\n{}".format(str(datetime.timedelta(seconds=runtime))))a
runtime = time() - start
print("Optimization time:\n{}".format(runtime))
params = copy(analysis.get_best_config(ray_metric, "min"))
params.pop("wandb")
# params["n_estimators"] = 1000
start = time()
model = lgbm.train(
params,
flgbtrain,
valid_sets=[lgbtest],
callbacks=[lgbm.log_evaluation(show_stdv=False)],
)
runtime = time() - start
print("Final model training time:\n{}".format(str(datetime.timedelta(seconds=runtime))))a
Tensorboard visualization¶
In [ ]:
Copied!
from tensorboard import notebook
notebook.list()
from tensorboard import notebook
notebook.list()
In [ ]:
Copied!
%load_ext tensorboard
%tensorboard --logdir ~/ray_results
%load_ext tensorboard
%tensorboard --logdir ~/ray_results