Cannot run identify_zero_importance on small data
bwang482 opened this issue · 0 comments
bwang482 commented
I have 32 samples, and it seems feature-selector doesn't work with small datasets. I edited line 295
in feature_selector.py as shown below, but I am still getting the same error.
if task == 'classification':
    model = lgb.LGBMClassifier(min_child_samples=1, min_data_in_bin=1,
                               n_estimators=1000, learning_rate=0.05, verbose=-1)
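For context, here is a minimal reproduction of my setup. The random data is just a stand-in for my real 32 samples, and the FeatureSelector usage follows the project README; the identify_zero_importance call is exactly the one in the traceback below.

import numpy as np
import pandas as pd
from feature_selector import FeatureSelector

# 32 random samples standing in for my actual data.
rng = np.random.RandomState(0)
train = pd.DataFrame(rng.randn(32, 10), columns=['f%d' % i for i in range(10)])
train_labels = pd.Series(rng.randint(0, 2, size=32))

fs = FeatureSelector(data=train, labels=train_labels)
fs.identify_zero_importance(task='classification', eval_metric='auc',
                            n_iterations=10, early_stopping=True)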
The error message I keep getting:
Training Gradient Boosting Model
---------------------------------------------------------------------------
LightGBMError Traceback (most recent call last)
<ipython-input-5-47b00f66835b> in <module>
1 fs.identify_zero_importance(task = 'classification', eval_metric = 'auc',
----> 2 n_iterations = 10, early_stopping = True)
/mnt/sdb/env1/lib/python3.6/site-packages/feature_selector/feature_selector.py in identify_zero_importance(self, task, eval_metric, n_iterations, early_stopping)
309 model.fit(train_features, train_labels, eval_metric = eval_metric,
310 eval_set = [(valid_features, valid_labels)],
--> 311 early_stopping_rounds = 100, verbose = -1)
312
313 # Clean up memory
/mnt/sdb/env1/lib/python3.6/site-packages/lightgbm/sklearn.py in fit(self, X, y, sample_weight, init_score, eval_set, eval_names, eval_sample_weight, eval_class_weight, eval_init_score, eval_metric, early_stopping_rounds, verbose, feature_name, categorical_feature, callbacks)
673 verbose=verbose, feature_name=feature_name,
674 categorical_feature=categorical_feature,
--> 675 callbacks=callbacks)
676 return self
677
/mnt/sdb/env1/lib/python3.6/site-packages/lightgbm/sklearn.py in fit(self, X, y, sample_weight, init_score, group, eval_set, eval_names, eval_sample_weight, eval_class_weight, eval_init_score, eval_group, eval_metric, early_stopping_rounds, verbose, feature_name, categorical_feature, callbacks)
467 verbose_eval=verbose, feature_name=feature_name,
468 categorical_feature=categorical_feature,
--> 469 callbacks=callbacks)
470
471 if evals_result:
/mnt/sdb/env1/lib/python3.6/site-packages/lightgbm/engine.py in train(params, train_set, num_boost_round, valid_sets, valid_names, fobj, feval, init_model, feature_name, categorical_feature, early_stopping_rounds, evals_result, verbose_eval, learning_rates, keep_training_booster, callbacks)
178 # construct booster
179 try:
--> 180 booster = Booster(params=params, train_set=train_set)
181 if is_valid_contain_train:
182 booster.set_train_data_name(train_data_name)
/mnt/sdb/env1/lib/python3.6/site-packages/lightgbm/basic.py in __init__(self, params, train_set, model_file, silent)
1301 self.handle = ctypes.c_void_p()
1302 _safe_call(_LIB.LGBM_BoosterCreate(
-> 1303 train_set.construct().handle,
1304 c_str(params_str),
1305 ctypes.byref(self.handle)))
/mnt/sdb/env1/lib/python3.6/site-packages/lightgbm/basic.py in construct(self)
854 weight=self.weight, group=self.group, init_score=self.init_score,
855 predictor=self._predictor, silent=self.silent, feature_name=self.feature_name,
--> 856 categorical_feature=self.categorical_feature, params=self.params)
857 if self.free_raw_data:
858 self.data = None
/mnt/sdb/env1/lib/python3.6/site-packages/lightgbm/basic.py in _lazy_init(self, data, label, reference, weight, group, init_score, predictor, silent, feature_name, categorical_feature, params)
708 self.__init_from_csc(data, params_str, ref_dataset)
709 elif isinstance(data, np.ndarray):
--> 710 self.__init_from_np2d(data, params_str, ref_dataset)
711 else:
712 try:
/mnt/sdb/env1/lib/python3.6/site-packages/lightgbm/basic.py in __init_from_np2d(self, mat, params_str, ref_dataset)
770 c_str(params_str),
771 ref_dataset,
--> 772 ctypes.byref(self.handle)))
773
774 def __init_from_csr(self, csr, params_str, ref_dataset):
/mnt/sdb/env1/lib/python3.6/site-packages/lightgbm/basic.py in _safe_call(ret)
46 """
47 if ret != 0:
---> 48 raise LightGBMError(_LIB.LGBM_GetLastError())
49
50
LightGBMError: b'Cannot construct Dataset since there are not useful features. It should be at least two unique rows. If the num_row (num_data) is small, you can set min_data=1 and min_data_in_bin=1 to fix this. Otherwise please make sure you are using the right dataset.'
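For what it's worth, here is a minimal sketch of what I understand the message to be suggesting: calling lgb.train directly so that min_data_in_leaf (an alias of min_data / min_child_samples) and min_data_in_bin definitely reach the Dataset construction. Random data stands in for my real samples again, and I may be misreading the suggested parameters.

import numpy as np
import lightgbm as lgb

rng = np.random.RandomState(0)
X = rng.randn(32, 10)
y = rng.randint(0, 2, size=32)

params = {
    'objective': 'binary',
    'metric': 'auc',
    'min_data_in_leaf': 1,  # alias of min_data / min_child_samples
    'min_data_in_bin': 1,
    'verbose': -1,
}

# The error above is raised while binning features during Dataset
# construction, so the params are passed to the Dataset as well as to train().
train_set = lgb.Dataset(X, label=y, params=params)
booster = lgb.train(params, train_set, num_boost_round=100)

If that is the intended fix, then I suppose these parameters also need to reach the Dataset that LGBMClassifier builds internally, which is what I assumed my edit at line 295 would accomplish.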