rushter/heamy

Scipy sparse matrices support

alexeygrigorev opened this issue · 0 comments

Heamy does not seem to support sparse matrices at the moment.

When I create a dataset where X_train and X_test are scipy sparse matrices, I get the following error:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-37-cc350d1da8a6> in <module>()
      1 pipeline = ModelsPipeline(*classifiers)
----> 2 pipeline.stack()

/home/agrigorev/anaconda2/lib/python2.7/site-packages/heamy/pipeline.pyc in stack(self, k, stratify, shuffle, seed, full_test, add_diff)
    131 
    132         for model in self.models:
--> 133             result = model.stack(k=k, stratify=stratify, shuffle=shuffle, seed=seed, full_test=full_test)
    134             train_df = pd.DataFrame(result.X_train, columns=generate_columns(result.X_train, model.name))
    135             test_df = pd.DataFrame(result.X_test, columns=generate_columns(result.X_test, model.name))

/home/agrigorev/anaconda2/lib/python2.7/site-packages/heamy/estimator.pyc in stack(self, k, stratify, shuffle, seed, full_test)
    245         if self.use_cache:
    246             pdict = {'k': k, 'stratify': stratify, 'shuffle': shuffle, 'seed': seed, 'full_test': full_test}
--> 247             dhash = self._dhash(pdict)
    248             c = Cache(dhash, prefix='s')
    249             if c.available:

/home/agrigorev/anaconda2/lib/python2.7/site-packages/heamy/estimator.pyc in _dhash(self, params)
    132         """Get hash of the dictionary object."""
    133         m = hashlib.new('md5')
--> 134         m.update(self.hash.encode('utf-8'))
    135         for key in sorted(params.keys()):
    136             h_string = ('%s-%s' % (key, params[key])).encode('utf-8')

/home/agrigorev/anaconda2/lib/python2.7/site-packages/heamy/estimator.pyc in hash(self)
     78                 m.update(h_string)
     79             m.update(self.estimator_name.encode('utf-8'))
---> 80             m.update(self.dataset.hash.encode('utf-8'))
     81 
     82             if not self._is_class:

/home/agrigorev/anaconda2/lib/python2.7/site-packages/heamy/dataset.pyc in hash(self)
    235             m = hashlib.new('md5')
    236             if self._preprocessor is None:
--> 237                 m.update(numpy_buffer(self._X_train))
    238                 m.update(numpy_buffer(self._y_train))
    239                 if self._X_test is not None:

/home/agrigorev/anaconda2/lib/python2.7/site-packages/heamy/cache.pyc in numpy_buffer(ndarray)
     55         ndarray = ndarray.values
     56 
---> 57     if ndarray.flags.c_contiguous:
     58         obj_c_contiguous = ndarray
     59     elif ndarray.flags.f_contiguous:

/home/agrigorev/anaconda2/lib/python2.7/site-packages/scipy/sparse/base.pyc in __getattr__(self, attr)
    523             return self.getnnz()
    524         else:
--> 525             raise AttributeError(attr + " not found")
    526 
    527     def transpose(self):

AttributeError: flags not found

The matrices are produced by scikit-learn's DictVectorizer, which returns SciPy sparse matrices by default.

As a temporary workaround, I convert the matrices to dense arrays with X.toarray() before creating the dataset.