Scipy sparse matrices support
alexeygrigorev opened this issue · 0 comments
alexeygrigorev commented
Heamy does not seem to support sparse matrices at the moment.
When I create a dataset where `X_train` and `X_test` are scipy sparse matrices, I get the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-37-cc350d1da8a6> in <module>()
1 pipeline = ModelsPipeline(*classifiers)
----> 2 pipeline.stack()
/home/agrigorev/anaconda2/lib/python2.7/site-packages/heamy/pipeline.pyc in stack(self, k, stratify, shuffle, seed, full_test, add_diff)
131
132 for model in self.models:
--> 133 result = model.stack(k=k, stratify=stratify, shuffle=shuffle, seed=seed, full_test=full_test)
134 train_df = pd.DataFrame(result.X_train, columns=generate_columns(result.X_train, model.name))
135 test_df = pd.DataFrame(result.X_test, columns=generate_columns(result.X_test, model.name))
/home/agrigorev/anaconda2/lib/python2.7/site-packages/heamy/estimator.pyc in stack(self, k, stratify, shuffle, seed, full_test)
245 if self.use_cache:
246 pdict = {'k': k, 'stratify': stratify, 'shuffle': shuffle, 'seed': seed, 'full_test': full_test}
--> 247 dhash = self._dhash(pdict)
248 c = Cache(dhash, prefix='s')
249 if c.available:
/home/agrigorev/anaconda2/lib/python2.7/site-packages/heamy/estimator.pyc in _dhash(self, params)
132 """Get hash of the dictionary object."""
133 m = hashlib.new('md5')
--> 134 m.update(self.hash.encode('utf-8'))
135 for key in sorted(params.keys()):
136 h_string = ('%s-%s' % (key, params[key])).encode('utf-8')
/home/agrigorev/anaconda2/lib/python2.7/site-packages/heamy/estimator.pyc in hash(self)
78 m.update(h_string)
79 m.update(self.estimator_name.encode('utf-8'))
---> 80 m.update(self.dataset.hash.encode('utf-8'))
81
82 if not self._is_class:
/home/agrigorev/anaconda2/lib/python2.7/site-packages/heamy/dataset.pyc in hash(self)
235 m = hashlib.new('md5')
236 if self._preprocessor is None:
--> 237 m.update(numpy_buffer(self._X_train))
238 m.update(numpy_buffer(self._y_train))
239 if self._X_test is not None:
/home/agrigorev/anaconda2/lib/python2.7/site-packages/heamy/cache.pyc in numpy_buffer(ndarray)
55 ndarray = ndarray.values
56
---> 57 if ndarray.flags.c_contiguous:
58 obj_c_contiguous = ndarray
59 elif ndarray.flags.f_contiguous:
/home/agrigorev/anaconda2/lib/python2.7/site-packages/scipy/sparse/base.pyc in __getattr__(self, attr)
523 return self.getnnz()
524 else:
--> 525 raise AttributeError(attr + " not found")
526
527 def transpose(self):
AttributeError: flags not found
The matrices are obtained via `DictVectorizer` from sklearn, which returns scipy sparse matrices by default.
As a temporary workaround, I convert the matrices to dense arrays with `X.toarray()`.