Trying to replicate with different data set
huongvu16 opened this issue · 3 comments
Hi Nicolo!
Thanks for posting the example of Gradient Boosting in TensorFlow. I am trying to replicate your model on a different data set (the lending club data, i.e. the sample data used to run the examples in h2o.ai) and to customize your code to fit it. The file is named 'processed.csv' here, and I have deleted all rows that contain 'nan' values.
By the way, I am only interested in running the TensorFlow model, not XGBoost.
I am running Python 3.6 in an Anaconda environment, TensorFlow version 1.4, on Mac OS X 10.12.6.
The pre-processing part is as follows (I have omitted all the imports):
```python
cols = ['loan_amnt', 'term', 'int_rate', 'emp_length', 'annual_inc', 'dti',
        'delinq_2yrs', 'bad_loan', 'revol_util', 'total_acc',
        'longest_credit_length', 'home_ownership', 'purpose', 'addr_state',
        'verification_status']


def _get_df_from_file(file_name):
    df = pd.read_csv(file_name)
    labels = df['bad_loan']
    del df['bad_loan']
    return df, labels


if __name__ == '__main__':
    df, labels = _get_df_from_file('processed.csv')
    X_train, X_test, y_train, y_test = train_test_split(
        df, labels, test_size=0.25, random_state=42)
    data = dict(
        feature_names=df.columns,
        X_train=X_train, y_train=y_train,
        X_test=X_test, y_test=y_test)
    for k, v in data.items():
        print(k, v.shape)
    np.savez('processed.npz', **data)
```
The output of this is:
```
X_train (118497, 14)
y_train (118497,)
feature_names (14,)
X_test (39499, 14)
y_test (39499,)
```
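As a sanity check (a minimal sketch, not part of the original script), the saved archive can be inspected for shapes and, importantly, dtypes:

```python
import numpy as np

# dtype=object below would mean the string (categorical) columns made it
# into the feature matrices as raw Python objects rather than numbers.
data = np.load('processed.npz')
for k in data.files:
    print(k, data[k].shape, data[k].dtype)
```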
Now the model in TensorFlow (again, all the imports omitted):
```python
FLAGS = None


def _get_tfbt(output_dir, feature_cols):
    """Configure and build the gradient boosted trees estimator."""
    learner_config = learner_pb2.LearnerConfig()
    learner_config.learning_rate_tuner.fixed.learning_rate = FLAGS.learning_rate
    learner_config.regularization.l1 = 0.0
    learner_config.regularization.l2 = FLAGS.l2 / FLAGS.batch_size
    learner_config.constraints.max_tree_depth = FLAGS.depth
    learner_config.growing_mode = learner_pb2.LearnerConfig.LAYER_BY_LAYER
    run_config = tf.contrib.learn.RunConfig(save_checkpoints_secs=30)
    estimator = GradientBoostedDecisionTreeClassifier(
        learner_config=learner_config,
        examples_per_layer=FLAGS.examples_per_layer,
        n_classes=2,
        num_trees=FLAGS.num_trees,
        feature_columns=feature_cols,
        model_dir=output_dir,
        config=run_config,
        center_bias=False)
    return estimator


def _matrix_to_dict(matrix, col_names):
    # Split the 2-D feature matrix into one (n, 1) column per feature name.
    return {
        feat_name: matrix[:, feat_idx, np.newaxis]
        for feat_idx, feat_name in enumerate(col_names)}


def _make_input_fn(which_set):
    data = np.load('processed.npz')
    feature_names = data['feature_names']
    feature_columns = [feature_column.real_valued_column(k)
                       for k in feature_names]
    if which_set == 'train':
        return feature_columns, tf.estimator.inputs.numpy_input_fn(
            x=_matrix_to_dict(data['X_train'], feature_names),
            y=data['y_train'],
            batch_size=100,
            num_epochs=None,
            shuffle=True)
    elif which_set == 'test':
        return feature_columns, tf.estimator.inputs.numpy_input_fn(
            x=_matrix_to_dict(data['X_test'], feature_names),
            y=data['y_test'],
            num_epochs=1,
            shuffle=False)
    else:
        raise NotImplementedError()


def _make_experiment_fn(output_dir):
    feature_columns, train_input_fn = _make_input_fn('train')
    feature_columns, test_input_fn = _make_input_fn('test')
    return tf.contrib.learn.Experiment(
        estimator=_get_tfbt(output_dir, feature_columns),
        train_input_fn=train_input_fn,
        eval_input_fn=test_input_fn,
        train_steps=None,
        eval_metrics=None,
        eval_steps=None)


def main(unused_argv):
    learn_runner.run(
        experiment_fn=_make_experiment_fn,
        output_dir=FLAGS.output_dir,
        schedule='train_and_evaluate')
    # Training is done; rebuild the one-pass test input_fn and save the
    # predicted probability of the positive class for every test example.
    feature_columns, test_input_fn = _make_input_fn('test')
    estimator = _get_tfbt(FLAGS.output_dir, feature_columns)
    results = estimator.predict(input_fn=test_input_fn)
    y_predict = np.array([r['probabilities'][1] for r in results])
    np.save(os.path.join(FLAGS.output_dir, 'prediction_tf.npy'), y_predict)


if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--batch_size",
        type=int,
        default=10000,
        help="The batch size for reading data.")
    parser.add_argument(
        "--depth",
        type=int,
        default=6,
        help="Maximum depth of weak learners.")
    parser.add_argument(
        "--l2",
        type=float,
        default=1.0,
        help="l2 regularization per batch.")
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=0.1,
        help="Learning rate (shrinkage weight) with which each new tree is added.")
    parser.add_argument(
        "--examples_per_layer",
        type=int,
        default=5000,
        help="Number of examples to accumulate stats for per layer.")
    parser.add_argument(
        "--num_trees",
        type=int,
        default=10,
        help="Number of trees to grow before stopping.")
    FLAGS, unparsed = parser.parse_known_args()
    FLAGS.output_dir = 'outputs/tf_t{:03d}_d{:02d}_ex{:05d}'.format(
        FLAGS.num_trees, FLAGS.depth, FLAGS.examples_per_layer)
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
```
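I run the script like this (the file name `gbdt_lending.py` is just what I called it locally; every flag has a default, so they can all be omitted):

```
python gbdt_lending.py --num_trees 10 --depth 6 --examples_per_layer 5000
```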
And the result is:
```
INFO:tensorflow:Using config: {'_master': '', '_num_worker_replicas': 0, '_save_summary_steps': 100, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_keep_checkpoint_every_n_hours': 10000, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12c509438>, '_save_checkpoints_steps': None, '_task_id': 0, '_environment': 'local', '_log_step_count_steps': 100, '_model_dir': 'outputs/tf_t010_d06_ex05000', '_keep_checkpoint_max': 5, '_evaluation_master': '', '_is_chief': True, '_session_config': None, '_save_checkpoints_secs': 30, '_tf_random_seed': None, '_task_type': None, '_num_ps_replicas': 0}
INFO:tensorflow:Active Feature Columns: ['addr_state', 'annual_inc', 'delinq_2yrs', 'dti', 'emp_length', 'home_ownership', 'int_rate', 'loan_amnt', 'longest_credit_length', 'purpose', 'revol_util', 'term', 'total_acc', 'verification_status']
WARNING:tensorflow:Casting <dtype: 'int64'> labels to bool.
WARNING:tensorflow:Casting <dtype: 'int64'> labels to bool.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from outputs/tf_t010_d06_ex05000/model.ckpt-0
INFO:tensorflow:Error reported to Coordinator: <class 'tensorflow.python.framework.errors_impl.InternalError'>, Unable to get element as bytes.
INFO:tensorflow:Saving checkpoints for 0 into outputs/tf_t010_d06_ex05000/model.ckpt.
WARNING:tensorflow:Error encountered when serializing resources.
Type is unsupported, or the types of the items don't match field type in CollectionDef.
'_Resource' object has no attribute 'name'

TypeError                                 Traceback (most recent call last)
TypeError: expected bytes, float found

During handling of the above exception, another exception occurred:

SystemError                               Traceback (most recent call last)
.....
InternalError: Unable to get element as bytes.
```
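One thing I notice while poking at this (a toy check, using the same pandas/numpy as above): a DataFrame that mixes string and numeric columns collapses to a dtype=object matrix when you take its values, which may or may not be related to the "Unable to get element as bytes" error:

```python
import pandas as pd

# Toy frame mixing a numeric column with a string one, like the
# lending club data ('int_rate' is numeric, 'home_ownership' is text).
df = pd.DataFrame({'int_rate': [10.5, 13.2],
                   'home_ownership': ['RENT', 'OWN']})
print(df.values.dtype)  # object: every element is boxed as a Python object
```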
From the code I pasted above, do you have any pointers to what could be causing this problem?
Many thanks!
Did you end up solving this?
Hi, yes, I fixed the issue with the categorical columns.
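For anyone hitting the same error, a minimal sketch of the kind of change I mean (from memory; the exact code may have differed): integer-encode every string column before the DataFrame is saved, so that real_valued_column actually applies to all features:

```python
import pandas as pd

def encode_categoricals(df):
    # Replace each string (object-dtype) column with integer codes so the
    # whole feature matrix is numeric. Treating categories as plain numbers
    # is crude, but it lets the real_valued_column pipeline run.
    for col in df.select_dtypes(include=['object']).columns:
        df[col], _ = pd.factorize(df[col])
    return df
```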
I'm now trying to produce the prediction array for the training set as well, using the following code:
```python
def main(unused_argv):
    learn_runner.run(
        experiment_fn=_make_experiment_fn,
        output_dir=FLAGS.output_dir,
        schedule='train_and_evaluate')
    # Same as before, but score the training set instead of the test set.
    feature_columns, train_input_fn = _make_input_fn('train')
    estimator = _get_tfbt(FLAGS.output_dir, feature_columns)
    results = estimator.predict(input_fn=train_input_fn)
    y_predict = np.array([r['probabilities'][1] for r in results])
    np.save(os.path.join(FLAGS.output_dir, 'train_prediction_tf.npy'), y_predict)


if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--batch_size",
        type=int,
        default=10000,
        help="The batch size for reading data.")
    parser.add_argument(
        "--depth",
        type=int,
        default=6,
        help="Maximum depth of weak learners.")
    parser.add_argument(
        "--l2",
        type=float,
        default=1.0,
        help="l2 regularization per batch.")
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=0.1,
        help="Learning rate (shrinkage weight) with which each new tree is added.")
    parser.add_argument(
        "--examples_per_layer",
        type=int,
        default=5000,
        help="Number of examples to accumulate stats for per layer.")
    parser.add_argument(
        "--num_trees",
        type=int,
        default=10,
        help="Number of trees to grow before stopping.")
    FLAGS, unparsed = parser.parse_known_args()
    FLAGS.output_dir = 'outputs/tf_t{:03d}_d{:02d}_ex{:05d}'.format(
        FLAGS.num_trees, FLAGS.depth, FLAGS.examples_per_layer)
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
```
The kernel runs but seems to take forever (I left it for an hour and when I came back it was still running, without any new logs though). Do you think this is because the X_train file is too big?
From my tests, it did seem very finicky, so I wouldn't be surprised if it's acting up. Try raising the verbosity options, maybe you'll get some more output.
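One more guess from the code above: the 'train' input_fn is built with num_epochs=None and shuffle=True, so estimator.predict() will keep cycling through the data and never terminate, which by itself would explain a job that runs forever with no new logs. A sketch of a separate one-pass input_fn for scoring the training set (reusing _matrix_to_dict from your script), plus the verbosity bump:

```python
import numpy as np
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.DEBUG)  # more detail than INFO

def _make_train_predict_input_fn():
    # One ordered pass over X_train so that predict() stops after one epoch.
    data = np.load('processed.npz')
    feature_names = data['feature_names']
    return tf.estimator.inputs.numpy_input_fn(
        x=_matrix_to_dict(data['X_train'], feature_names),
        y=None,          # labels are not needed for prediction
        num_epochs=1,    # the important part: raise OutOfRange after one pass
        shuffle=False)
```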