geom_violin memory consumption
x1o opened this issue · 2 comments
Compared with geom_boxplot(), geom_violin() seems to consume an unreasonable amount of memory. I get this error when trying to plot a 70091 x 2 dataframe:
Unable to allocate 36.6 GiB for an array with shape (70091, 70091) and data type float64
geom_violin
densities are computed using statsmodels.nonparametric.kde.KDEUnivariate. Violins (densities) use more computing resources than boxplots. You have not posted a stack trace, so I cannot tell which part of the pipeline is choking, but a 70091 x 70091 array gives 70091 * 70091 * 64 / (8 * 1024**3) = 36.6 GiB.
I do not think there is anything we can do about it in plotnine.
Just to add that I tested this today. While a dataframe with shape (2903785, 3) cannot be plotted as a violin plot in plotnine, it can be in seaborn. Looking here, I see that seaborn uses scipy.stats.gaussian_kde().
Here is the output from plotnine:
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
/usr/lib/python3/dist-packages/IPython/core/formatters.py in __call__(self, obj)
700 type_pprinters=self.type_printers,
701 deferred_pprinters=self.deferred_printers)
--> 702 printer.pretty(obj)
703 printer.flush()
704 return stream.getvalue()
/usr/lib/python3/dist-packages/IPython/lib/pretty.py in pretty(self, obj)
392 if cls is not object \
393 and callable(cls.__dict__.get('__repr__')):
--> 394 return _repr_pprint(obj, self, cycle)
395
396 return _default_pprint(obj, self, cycle)
/usr/lib/python3/dist-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
682 """A pprint that just redirects to the normal repr function."""
683 # Find newlines and replace them with p.break_()
--> 684 output = repr(obj)
685 lines = output.splitlines()
686 with p.group():
~/.local/lib/python3.8/site-packages/plotnine/ggplot.py in __repr__(self)
86 # in the jupyter notebook.
87 if not self.figure:
---> 88 self.draw()
89 plt.show()
90 return '<ggplot: (%d)>' % self.__hash__()
~/.local/lib/python3.8/site-packages/plotnine/ggplot.py in draw(self, return_ggplot)
179 # new frames knowing that they are separate from the original.
180 with pd.option_context('mode.chained_assignment', None):
--> 181 return self._draw(return_ggplot)
182
183 def _draw(self, return_ggplot=False):
~/.local/lib/python3.8/site-packages/plotnine/ggplot.py in _draw(self, return_ggplot)
186 # assign a default theme
187 self = deepcopy(self)
--> 188 self._build()
189
190 # If no theme we use the default
~/.local/lib/python3.8/site-packages/plotnine/ggplot.py in _build(self)
297
298 # Apply and map statistics
--> 299 layers.compute_statistic(layout)
300 layers.map_statistic(self)
301
~/.local/lib/python3.8/site-packages/plotnine/layer.py in compute_statistic(self, layout)
83 def compute_statistic(self, layout):
84 for l in self:
---> 85 l.compute_statistic(layout)
86
87 def map_statistic(self, plot):
~/.local/lib/python3.8/site-packages/plotnine/layer.py in compute_statistic(self, layout)
370 data = self.stat.use_defaults(data)
371 data = self.stat.setup_data(data)
--> 372 data = self.stat.compute_layer(data, params, layout)
373 self.data = data
374
~/.local/lib/python3.8/site-packages/plotnine/stats/stat.py in compute_layer(cls, data, params, layout)
272 return cls.compute_panel(pdata, pscales, **params)
273
--> 274 return groupby_apply(data, 'PANEL', fn)
275
276 @classmethod
~/.local/lib/python3.8/site-packages/plotnine/utils.py in groupby_apply(df, cols, func, *args, **kwargs)
631 # function fn should be free to modify dataframe d, therefore
632 # do not mark d as a slice of df i.e no SettingWithCopyWarning
--> 633 lst.append(func(d, *args, **kwargs))
634 return pd.concat(lst, axis=axis, ignore_index=True)
635
~/.local/lib/python3.8/site-packages/plotnine/stats/stat.py in fn(pdata)
270 return pdata
271 pscales = layout.get_scales(pdata['PANEL'].iat[0])
--> 272 return cls.compute_panel(pdata, pscales, **params)
273
274 return groupby_apply(data, 'PANEL', fn)
~/.local/lib/python3.8/site-packages/plotnine/stats/stat_ydensity.py in compute_panel(cls, data, scales, **params)
134 @classmethod
135 def compute_panel(cls, data, scales, **params):
--> 136 data = super(cls, cls).compute_panel(data, scales, **params)
137
138 if not len(data):
~/.local/lib/python3.8/site-packages/plotnine/stats/stat.py in compute_panel(cls, data, scales, **params)
303 stats = []
304 for _, old in data.groupby('group'):
--> 305 new = cls.compute_group(old, scales, **params)
306 unique = uniquecols(old)
307 missing = unique.columns.difference(new.columns)
~/.local/lib/python3.8/site-packages/plotnine/stats/stat_ydensity.py in compute_group(cls, data, scales, **params)
166 range_y = scales.y.dimension()
167
--> 168 dens = compute_density(data['y'], weight, range_y, **params)
169 dens['y'] = dens['x']
170 dens['x'] = np.mean([data['x'].min(), data['x'].max()])
~/.local/lib/python3.8/site-packages/plotnine/stats/stat_density.py in compute_density(x, weight, range, **params)
172 bw = nrd0(x)
173 kde = sm.nonparametric.KDEUnivariate(x)
--> 174 kde.fit(
175 kernel=params['kernel'],
176 bw=bw,
~/.local/lib/python3.8/site-packages/statsmodels/nonparametric/kde.py in fit(self, kernel, bw, fft, weights, gridsize, adjust, cut, clip)
176 )
177 else:
--> 178 density, grid, bw = kdensity(
179 endog,
180 kernel=kernel,
~/.local/lib/python3.8/site-packages/statsmodels/nonparametric/kde.py in kdensity(x, kernel, bw, weights, gridsize, adjust, clip, cut, retgrid)
423
424 k = (
--> 425 x.T - grid[:, None]
426 ) / bw # uses broadcasting to make a gridsize x nobs
427
MemoryError: Unable to allocate array with shape (442608, 442608) and data type float64