Memory error on Consumer_complaints.ipynb
credo99 opened this issue · 3 comments
MemoryError Traceback (most recent call last)
<ipython-input> in <module>()
3 tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
4
----> 5 features = tfidf.fit_transform(df.Consumer_complaint_narrative).toarray()
6 labels = df.category_id
7 features.shape
~\Anaconda3\lib\site-packages\scipy\sparse\compressed.py in toarray(self, order, out)
945 if out is None and order is None:
946 order = self._swap('cf')[0]
--> 947 out = self._process_toarray_args(order, out)
948 if not (out.flags.c_contiguous or out.flags.f_contiguous):
949 raise ValueError('Output array must be C or F contiguous')
~\Anaconda3\lib\site-packages\scipy\sparse\base.py in _process_toarray_args(self, order, out)
1182 return out
1183 else:
-> 1184 return np.zeros(self.shape, dtype=self.dtype, order=order)
1185
1186
MemoryError:
Apparently this is a well-known memory problem with the TfidfVectorizer fit_transform workflow: converting the sparse result to a dense array with toarray() on a huge text corpus can require hundreds of gigabytes of RAM. Most people recommend dropping the toarray() call, since most classifiers (except GaussianNB) also accept sparse matrices for fitting. When I removed toarray() the memory error went away, but features.shape jumped to (364203, 698312), far larger than the (4569, 12633) reported in the original notebook, and I can't figure out why. If removing toarray() is the wrong fix, then the code should be modified, because the memory error itself is strange: on my 64 GB laptop the Task Manager shows almost 30 GB of free RAM, yet the Jupyter notebook raises a memory error every time.
Sorry, I accidentally closed the issue.
I'm also facing the issue. What's the solution?
MemoryError Traceback (most recent call last)
<ipython-input> in <module>
6
7 # Perform cross validation
----> 8 results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
9
10 # Get results & append algorithm name
C:\ProgramData\Anaconda3\lib\site-packages\surprise\model_selection\validation.py in cross_validate(algo, data, measures, cv, return_train_measures, n_jobs, pre_dispatch, verbose)
99 return_train_measures)
100 for (trainset, testset) in cv.split(data))
--> 101 out = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch)(delayed_list)
102
103 (test_measures_dicts,
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1001 # remaining jobs.
1002 self._iterating = False
-> 1003 if self.dispatch_one_batch(iterator):
1004 self._iterating = self._original_iterator is not None
1005
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
832 return False
833 else:
--> 834 self._dispatch(tasks)
835 return True
836
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
751 with self._lock:
752 job_idx = len(self._jobs)
--> 753 job = self._backend.apply_async(batch, callback=cb)
754 # A job can complete so quickly than its callback is
755 # called before we get here, causing self._jobs to
C:\ProgramData\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
199 def apply_async(self, func, callback=None):
200 """Schedule a func to be run"""
--> 201 result = ImmediateResult(func)
202 if callback:
203 callback(result)
C:\ProgramData\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
580 # Don't delay the application, to avoid keeping the input
581 # arguments in memory
--> 582 self.results = batch()
583
584 def get(self):
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\surprise\model_selection\validation.py in fit_and_score(algo, trainset, testset, measures, return_train_measures)
162
163 start_fit = time.time()
--> 164 algo.fit(trainset)
165 fit_time = time.time() - start_fit
166 start_test = time.time()
C:\ProgramData\Anaconda3\lib\site-packages\surprise\prediction_algorithms\knns.py in fit(self, trainset)
97
98 SymmetricAlgo.fit(self, trainset)
---> 99 self.sim = self.compute_similarities()
100
101 return self
C:\ProgramData\Anaconda3\lib\site-packages\surprise\prediction_algorithms\algo_base.py in compute_similarities(self)
246 if getattr(self, 'verbose', False):
247 print('Computing the {0} similarity matrix...'.format(name))
--> 248 sim = construction_func[name](*args)
249 if getattr(self, 'verbose', False):
250 print('Done computing similarity matrix.')
C:\ProgramData\Anaconda3\lib\site-packages\surprise\similarities.pyx in surprise.similarities.msd()
MemoryError: Unable to allocate array with shape (3185, 3185) and data type int32
