Cannot support feature names consisting of the prefix 'f' followed by a number.

Question

Cannot support feature names consisting of the prefix 'f' followed by a number.

Kennylovetheworld opened this issue 4 years ago · 2 comments

I just found a bug when using nyoka to export a LightGBM model to a PMML file. The package fails to process data whose column names look like 'f1', 'f2', 'f3'. This is probably caused by the function "replace_name_with_derivedColumnNames(original_name, derived_col_names)" in the xgboost_to_pmml.py file: it strips every 'f' from the name and, if the remainder is all digits, uses it as an index into derived_col_names, which can be out of range. My current workaround is to rename the input columns with a different prefix to bypass the problem. Please fix this.
`---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
in
----> 1 lgb_to_pmml(pipeline_obj, features, target, "gd_lgbm.pmml")

~/anaconda3/envs/galaxy_risk/lib/python3.6/site-packages/nyoka/lgbm/lgb_to_pmml.py in lgb_to_pmml(pipeline, col_names, target_name, pmml_f_name, model_name, description)
62 mining_imp_val,
63 categoric_values,
---> 64 model_name)
65 pmml = pml.PMML(
66 version=PMML_SCHEMA.VERSION.value,

~/anaconda3/envs/galaxy_risk/lib/python3.6/site-packages/nyoka/lgbm/lgb_to_pmml.py in get_PMML_kwargs(model, derived_col_names, col_names, target_name, mining_imp_val, categoric_values, model_name)
105 mining_imp_val,
106 categoric_values,
--> 107 model_name)}
108 return algo_kwargs
109

~/anaconda3/envs/galaxy_risk/lib/python3.6/site-packages/nyoka/lgbm/lgb_to_pmml.py in get_ensemble_models(model, derived_col_names, col_names, target_name, mining_imp_val, categoric_values, model_name)
137 mining_models.append(pml.MiningModel(
138 modelName=model_name if model_name else "LightGBModel",
--> 139 Segmentation=get_outer_segmentation(model, derived_col_names, col_names, target_name, mining_imp_val,categoric_values,model_name),
140 **model_kwargs
141 ))

~/anaconda3/envs/galaxy_risk/lib/python3.6/site-packages/nyoka/lgbm/lgb_to_pmml.py in get_outer_segmentation(model, derived_col_names, col_names, target_name, mining_imp_val, categoric_values, model_name)
177 segmentation = pml.Segmentation(
178 multipleModelMethod=get_multiple_model_method(model),
--> 179 Segment=get_segments(model, derived_col_names, col_names, target_name, mining_imp_val,categoric_values,model_name)
180 )
181 return segmentation

~/anaconda3/envs/galaxy_risk/lib/python3.6/site-packages/nyoka/lgbm/lgb_to_pmml.py in get_segments(model, derived_col_names, col_names, target_name, mining_imp_val, categoric_values, model_name)
210 segments = None
211 if 'LGBMClassifier' in str(model.__class__):
--> 212 segments=get_segments_for_lgbc(model, derived_col_names, col_names, target_name, mining_imp_val,categoric_values,model_name)
213 elif 'LGBMRegressor' in str(model.__class__):
214 segments=get_segments_for_lgbr(model, derived_col_names, col_names, target_name, mining_imp_val,categoric_values)

~/anaconda3/envs/galaxy_risk/lib/python3.6/site-packages/nyoka/lgbm/lgb_to_pmml.py in get_segments_for_lgbc(model, derived_col_names, feature_names, target_name, mining_imp_val, categoric_values, model_name)
372 oField.append("lgbValue")
373 segments_equal_to_estimators = generate_Segments_Equal_To_Estimators(main_key_value, derived_col_names,
--> 374 feature_names)
375 First_segment = xgboostToPmml.add_segmentation(model,segments_equal_to_estimators, mining_schema_for_1st_segment, out, 1)
376 reg_model = sklToPmml.get_regrs_models(model, oField, oField, target_name, mining_imp_val, categoric_values,model_name)[0]

~/anaconda3/envs/galaxy_risk/lib/python3.6/site-packages/nyoka/lgbm/lgb_to_pmml.py in generate_Segments_Equal_To_Estimators(val, derived_col_names, col_names)
237 mining_field_for_innner_segments = col_names
238 m_flds = []
--> 239 create_node(val[i], main_node, derived_col_names)
240 for name in mining_field_for_innner_segments:
241 m_flds.append(pml.MiningField(name=name))

~/anaconda3/envs/galaxy_risk/lib/python3.6/site-packages/nyoka/lgbm/lgb_to_pmml.py in create_node(obj, main_node, derived_col_names)
326 else:
327
--> 328 main_node.add_Node(create_left_node(obj,derived_col_names))
329 main_node.add_Node(create_right_node(obj,derived_col_names))
330

~/anaconda3/envs/galaxy_risk/lib/python3.6/site-packages/nyoka/lgbm/lgb_to_pmml.py in create_left_node(obj, derived_col_names)
310 nd.set_SimplePredicate(
311 pml.SimplePredicate(field=xgboostToPmml.replace_name_with_derivedColumnNames(derived_col_names[int(obj['split_feature'])],
--> 312 derived_col_names), operator=SIMPLE_PREDICATE_OPERATOR.LESS_OR_EQUAL.value, value="{:.16f}".format(obj['threshold'])))
313 create_node(obj['left_child'], nd, derived_col_names)
314 return nd

~/anaconda3/envs/galaxy_risk/lib/python3.6/site-packages/nyoka/xgboost/xgboost_to_pmml.py in replace_name_with_derivedColumnNames(original_name, derived_col_names)
289 new = str.replace(original_name, 'f', '')
290 if new.isdigit():
--> 291 col_name = derived_col_names[int(new)]
292 else:
293 col_name = original_name

IndexError: list index out of range`

Answer 1 · 2020-08-11T10:39:51.000Z

Thanks, @Kennylovetheworld for pointing this out. This will be taken care of in future releases.

Answer 2 · 2021-08-03T08:07:42.000Z

Hi @Kennylovetheworld, this is fixed in Release 5.0.1