/wgcpy

Data analysis and PMML model framework

Primary LanguagePythonMIT LicenseMIT

wgcpy 包名

Data analysis and PMML model framework

version 1.0.0

Different modules of the Package are provided for everyone to use.

  • data detect
  • variable eda
  • the way of cut variable bins
  • calculate iv or psi
  • auto feature selector
  • generate PMML model

Require

  • python 3.5 or newer
  • Java 1.8 or newer. The java executable must be available on system path.

Install

GitLab安装

pip install --upgrade https://github.com/whyjust/wgcpy

Structure

wgcpy package tree structure:

WGCPY
D:\GITHUB\WGCPY
│  .gitignore
│  info.log
│  LICENSE
│  main.py
│  MANIFEST.in
│  README.md
│  requirements.txt
│  setup.py
│
├─data
│
├─pic
│
├─result
│
└─wgcpy
    │  config.py
    │  __init__.py
    │
    ├─bins
    │      chi_merge.py
    │      cut_bins.py
    │      __init__.py
    │
    ├─featureSelector
    │      cal_iv_psi.py
    │      cal_iv_psi_special.py
    │      selector.py
    │      __init__.py
    │
    ├─model
    │      dz_eval.py
    │      gen_model.py
    │      gen_pmml_model.py
    │      __init__.py
    │
    ├─preprocessing
    │      baggingPU.py
    │      data_detection.py
    │      eda.py
    │      __init__.py
    │
    └─utils
            ext_fn.py
            __init__.py

Usage

1 main.py运行
python main.py
2 数据EDA模块
plot_feature_boxplot(credit_data, numeric_feats)
plot_feature_distribution(credit_data, numeric_feats,
                          label="flag", sub_col=3)
plot_category_countplot(credit_data, category_feats, label="flag",
                        sub_col=5, figsize=(20,12))
plot_corr(credit_data, numeric_feats+['flag'], mask=True)
3 数据探查
# 数据分布
with timer('detect dataframe'):
    dec = DetectDF(credit_data)
    df_des = dec.detect(special_value_dict={-999:np.nan},
                        output=os.path.join(base_dir, "result"))
Pubagging
with timer('pu bagging'):
    estimator = LGBMClassifier(n_estimators=200, max_depth=2, learning_rate=0.1)
    bc = BaggingClassifierPU(base_estimator=estimator, 
                            n_estimators = 30, 
                            n_jobs = -1, 
                            max_samples = len(credit_data[credit_data['flag']==1]))
    bc.fit(credit_data[numeric_feats], credit_data['flag'])
    score_arr = bc.oob_decision_function_[:,1]
    credit_data['score_pb'] = score_arr
    credit_data = credit_data[(credit_data['score_pb'].isna()) | (credit_data['score_pb']<0.9)]
    print('PUbagging-shape:', credit_data.shape)
4 计算IV
with timer("cal iv"):
    iv_details = cal_total_var_iv(credit_data,
                                  numeric_feats=numeric_feats,
                                  category_feats=category_feats,
                                  target='flag',
                                  max_interval=10,
                                  method='tree')
    fig = plot_bin_woe(binx=iv_details[iv_details['variable'] == 'credit.amount'],
                       title=None,
                       display_iv=True)
    iv_details.to_csv(os.path.join(base_dir,r'result\iv_details.csv'), index=False)
5 计算PSI
with timer('cal psi'):
    except_array = credit_data['credit.amount'][:500]
    accept_arry = credit_data['credit.amount'][500:]
    psi_df = numeric_var_cal_psi(except_array,
                                 accept_arry,
                                 bins=10,
                                 bucket_type='bins',
                                 detail=True)
    psi_df.to_csv(os.path.join(base_dir, r'result\psi.csv'))
6 特征初筛与细筛
with timer("cal cv score"):
    groups = credit_data['status.of.existing.checking.account']
    config = {
        "na_threshold": 0.95,
        "correlation_threshold": 0.6,
        "importance_cumsum_threshold": 0.95,
        "params": {
            "n_estimators": 200,
            "max_depth": 2,
            "learning_rate": 0.1,
            "boosting_type": "gbdt",
            "importance_type": "gain",
            "n_jobs": -1
        },
        "kfold": "StratifiedKFold",
        "groups": None,
        "categorical_feature": category_feats,
        "n_splits": 5,
        "incre_params": None,
        "total_iter": 20,
        "step": 1,
        "auc_interval": None
    }
    fs = FeatureSelector(data=credit_data,
                         target='flag',
                         base_features=numeric_feats+category_feats)
    fs.identify_all(config=config)
    fs.plot_feature_importance()
    fs.result_save(output=os.path.join(base_dir, r".\result\feats_seletor_result.xlsx"))
7 PMML建模与评估
with timer("PMML model build"):
        trn_x, tes_x, y_trn, y_tes = train_test_split(credit_data,
                                                      credit_data['flag'],
                                                      test_size=0.2)
        base_feature = numeric_feats+category_feats
        cat_indices = [index for index,v in enumerate(base_feature) if v in category_feats]
        fit_params = {"classifier__categorical_feature" : cat_indices}
        print('cat_indices:', cat_indices)

        pmml_model = genPMMLModel(data=trn_x,
                                  target="flag",
                                  base_features=base_feature)

        pmml_model.make_pipeline_model(numeric_feature=numeric_feats,
                                       category_feature=category_feats,
                                       model_type='lgb',
                                       param_dict=config['params'],
                                       fit_params=fit_params)

        predict = pmml_model.evaluate(data=tes_x,
                                      target="flag")
        pmml_model.persist(base_dir="result",
                           model_name="credit")

Let's started! Welcome to star!