faster-lgbm-predictor

Speed up LightGBM prediction.


0. Introduction

Speeds up LightGBM prediction on a single record, by roughly 4x.
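Where the speedup comes from: Booster.predict is built for batch scoring and pays fixed per-call overhead (input validation, DataFrame conversion, C API marshalling) that dominates when scoring one record at a time. The fast predictors below are constructed from Booster.dump_model(), so they can walk the dumped trees with plain dict lookups instead. A minimal sketch of that idea, assuming numerical "<=" splits and the binary objective only (the real predictor must also handle categorical splits, missing-value defaults, and the other link functions; sketch_predict_one is a hypothetical name, not this project's API):

import math

def sketch_predict_one(dumped_model, row):
    # row is a feature-name -> value dict, e.g. one entry of df.to_dict("records")
    feature_names = dumped_model["feature_names"]
    raw = 0.0
    for tree in dumped_model["tree_info"]:
        node = tree["tree_structure"]
        while "leaf_value" not in node:   # descend until we hit a leaf
            if row[feature_names[node["split_feature"]]] <= node["threshold"]:
                node = node["left_child"]
            else:
                node = node["right_child"]
        raw += node["leaf_value"]          # leaf values already include shrinkage
    return 1.0 / (1.0 + math.exp(-raw))    # sigmoid link for objective="binary"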

import pandas as pd
import numpy as np
import lightgbm as lgb
from faster_lgbm_predictor_single import FasterLgbmSinglePredictor
from faster_lgbm_predictor_multiclass import FasterLgbmMulticlassPredictor

1. Load the data

df=pd.read_csv("./data/train.csv")
# drop identifier and text columns, keeping a small numeric feature set
df=df.drop(columns=["PassengerId","Name","Sex","Ticket","Cabin","Embarked"])
df=df.fillna(0)
df.head(5)
Survived Pclass Age SibSp Parch Fare
0 0 3 22.0 1 0 7.2500
1 1 1 38.0 1 0 71.2833
2 1 3 26.0 0 0 7.9250
3 1 1 35.0 1 0 53.1000
4 0 3 35.0 0 0 8.0500
target=df["Survived"].values
del df["Survived"]
categorical_features=["Pclass","SibSp","Parch"]

2.1 Binary classification test

params={"objective":"binary","max_depth":2}
lgb_model=lgb.train(
    params=params,
    train_set=lgb.Dataset(data=df,label=target,categorical_feature=categorical_features),
    num_boost_round=16,
)
[LightGBM] [Info] Number of positive: 342, number of negative: 549
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000900 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383838 -> initscore=-0.473288
[LightGBM] [Info] Start training from score -0.473288


D:\apps\Anaconda3\envs\autogluon\lib\site-packages\lightgbm\basic.py:2065: UserWarning: Using categorical_feature in Dataset.
  _log_warning('Using categorical_feature in Dataset.')
faster_lgbm_predictor=FasterLgbmSinglePredictor(model=lgb_model.dump_model(),cache_num=10)
ori_pred=lgb_model.predict(df)
fast_pred=[]
for input_data in df.to_dict("records"):
    fast_pred.append(faster_lgbm_predictor.predict(input_data).get("score"))
fast_pred=np.asarray(fast_pred)
np.sum(np.abs(fast_pred-ori_pred))
0.0
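For reference, this is the quantity the fast path has to reproduce: with objective "binary", the returned probability is the sigmoid of the summed leaf values, which the standard raw_score option exposes (plain LightGBM API, not part of this project):

raw=lgb_model.predict(df,raw_score=True)           # summed leaf values (log-odds)
np.sum(np.abs(1.0/(1.0+np.exp(-raw))-ori_pred))    # sigmoid recovers the probabilities, expect ~0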

2.2 Multiclass classification test

params={"objective":"multiclass","max_depth":2,"num_class":2}
lgb_model=lgb.train(
    params=params,
    train_set=lgb.Dataset(data=df,label=target,categorical_feature=categorical_features),
    num_boost_round=16,
)
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000781 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 5
[LightGBM] [Info] Start training from score -0.484246
[LightGBM] [Info] Start training from score -0.957534
ori_pred=pd.DataFrame(lgb_model.predict(df))
faster_lgbm_predictor=FasterLgbmMulticlassPredictor(model=lgb_model.dump_model(),cache_num=10)
fast_pred=[]
for input_data in df.to_dict("records"):
    fast_pred.append(faster_lgbm_predictor.predict(input_data).get("score"))
fast_pred=pd.DataFrame(fast_pred)
error_value=0
for col in fast_pred.columns:
    error_value+=np.sum(np.abs(fast_pred[col]-ori_pred[col]))
error_value
8.013034680232067e-14
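Likewise with "multiclass": the per-class probabilities are a softmax over the per-class raw scores (plain LightGBM API, shown for reference):

raw=lgb_model.predict(df,raw_score=True)                  # shape (n_rows, num_class)
softmax=np.exp(raw)/np.exp(raw).sum(axis=1,keepdims=True)
np.sum(np.abs(softmax-ori_pred.values))                   # expect ~0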

2.3 Regression test

params={"objective":"regression","max_depth":2}
lgb_model=lgb.train(
    params=params,
    train_set=lgb.Dataset(data=df,label=target,categorical_feature=categorical_features),
    num_boost_round=16,
)
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015627 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 5
[LightGBM] [Info] Start training from score 0.383838
faster_lgbm_predictor=FasterLgbmSinglePredictor(model=lgb_model.dump_model(),cache_num=10)
ori_pred=lgb_model.predict(df)
fast_pred=[]
for input_data in df.to_dict("records"):
    fast_pred.append(faster_lgbm_predictor.predict(input_data).get("score"))
fast_pred=np.asarray(fast_pred)
np.sum(np.abs(fast_pred-ori_pred))
0.0
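No link function is involved for "regression": the booster's output is just the summed leaf values, which is why the fast path again matches exactly.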

2.4 Tweedie regression test

params={"objective":"tweedie","max_depth":2}
lgb_model=lgb.train(
    params=params,
    train_set=lgb.Dataset(df,target,categorical_feature=categorical_features),
    num_boost_round=16,
)
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012412 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 5
[LightGBM] [Info] Start training from score -0.957534
faster_lgbm_predictor=FasterLgbmSinglePredictor(model=lgb_model.dump_model(),cache_num=10)
ori_pred=lgb_model.predict(df)
fast_pred=[]
for input_data in df.to_dict("records"):
    fast_pred.append(faster_lgbm_predictor.predict(input_data).get("score"))
fast_pred=np.asarray(fast_pred)
np.sum(np.abs(fast_pred-ori_pred))
0.0
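Unlike plain regression, "tweedie" (like "poisson" and "gamma") uses a log link, so the booster returns exp of the summed leaf values; that is the extra step the fast predictor has to apply for this objective. A quick check with the plain LightGBM API:

raw=lgb_model.predict(df,raw_score=True)
np.sum(np.abs(np.exp(raw)-ori_pred))   # exp recovers the predictions, expect ~0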

3. Performance comparison

from tqdm import tqdm
# fast path: one dict per row through the fast predictor
new_data=df.to_dict("records")
for data in tqdm(new_data):
    faster_lgbm_predictor.predict(data)
100%|███████████████████████████████████████████████████████████████████████████████| 891/891 [00:02<00:00, 425.57it/s]
# baseline: one single-row DataFrame per call through Booster.predict
new_data=[df.iloc[[i]] for i in range(len(df))]
for data in tqdm(new_data):
    lgb_model.predict(data)
100%|████████████████████████████████████████████████████████████████████████████████| 891/891 [00:10<00:00, 88.66it/s]
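About 426 it/s versus 89 it/s on the same 891 rows: roughly a 4-5x speedup per call, in line with the claim in the introduction.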