has2k1/plotnine

geom_violin memory consumption

x1o opened this issue · 2 comments

x1o commented

Compared with geom_boxplot(), geom_violin() seems to consume an unreasonable amount of memory. I get this error trying to plot a 70091 x 2 dataframe:

Unable to allocate 36.6 GiB for an array with shape (70091, 70091) and data type float64

geom_violin densities are computed using statsmodels.nonparametric.kde.KDEUnivariate. Violins (densities) use more computing resources than boxplots. You have not posted a stack trace, so I cannot tell which part of the pipeline is choking, but a 70091 x 70091 array of float64 gives 70091 * 70091 * 64 / (8 * 1024**3) = 36.6 GiB. I do not think there is anything we can do about it in plotnine.

Just to add that I tested this today. While a dataframe with shape (2903785, 3) cannot be plotted as a violin plot in plotnine, it can be in seaborn. Looking here, I see that seaborn uses scipy.stats.gaussian_kde().

Here is the output from plotnine:

---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
/usr/lib/python3/dist-packages/IPython/core/formatters.py in __call__(self, obj)
    700                 type_pprinters=self.type_printers,
    701                 deferred_pprinters=self.deferred_printers)
--> 702             printer.pretty(obj)
    703             printer.flush()
    704             return stream.getvalue()

/usr/lib/python3/dist-packages/IPython/lib/pretty.py in pretty(self, obj)
    392                         if cls is not object \
    393                                 and callable(cls.__dict__.get('__repr__')):
--> 394                             return _repr_pprint(obj, self, cycle)
    395 
    396             return _default_pprint(obj, self, cycle)

/usr/lib/python3/dist-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
    682     """A pprint that just redirects to the normal repr function."""
    683     # Find newlines and replace them with p.break_()
--> 684     output = repr(obj)
    685     lines = output.splitlines()
    686     with p.group():

~/.local/lib/python3.8/site-packages/plotnine/ggplot.py in __repr__(self)
     86         # in the jupyter notebook.
     87         if not self.figure:
---> 88             self.draw()
     89         plt.show()
     90         return '<ggplot: (%d)>' % self.__hash__()

~/.local/lib/python3.8/site-packages/plotnine/ggplot.py in draw(self, return_ggplot)
    179         # new frames knowing that they are separate from the original.
    180         with pd.option_context('mode.chained_assignment', None):
--> 181             return self._draw(return_ggplot)
    182 
    183     def _draw(self, return_ggplot=False):

~/.local/lib/python3.8/site-packages/plotnine/ggplot.py in _draw(self, return_ggplot)
    186         # assign a default theme
    187         self = deepcopy(self)
--> 188         self._build()
    189 
    190         # If no theme we use the default

~/.local/lib/python3.8/site-packages/plotnine/ggplot.py in _build(self)
    297 
    298         # Apply and map statistics
--> 299         layers.compute_statistic(layout)
    300         layers.map_statistic(self)
    301 

~/.local/lib/python3.8/site-packages/plotnine/layer.py in compute_statistic(self, layout)
     83     def compute_statistic(self, layout):
     84         for l in self:
---> 85             l.compute_statistic(layout)
     86 
     87     def map_statistic(self, plot):

~/.local/lib/python3.8/site-packages/plotnine/layer.py in compute_statistic(self, layout)
    370         data = self.stat.use_defaults(data)
    371         data = self.stat.setup_data(data)
--> 372         data = self.stat.compute_layer(data, params, layout)
    373         self.data = data
    374 

~/.local/lib/python3.8/site-packages/plotnine/stats/stat.py in compute_layer(cls, data, params, layout)
    272             return cls.compute_panel(pdata, pscales, **params)
    273 
--> 274         return groupby_apply(data, 'PANEL', fn)
    275 
    276     @classmethod

~/.local/lib/python3.8/site-packages/plotnine/utils.py in groupby_apply(df, cols, func, *args, **kwargs)
    631         # function fn should be free to modify dataframe d, therefore
    632         # do not mark d as a slice of df i.e no SettingWithCopyWarning
--> 633         lst.append(func(d, *args, **kwargs))
    634     return pd.concat(lst, axis=axis, ignore_index=True)
    635 

~/.local/lib/python3.8/site-packages/plotnine/stats/stat.py in fn(pdata)
    270                 return pdata
    271             pscales = layout.get_scales(pdata['PANEL'].iat[0])
--> 272             return cls.compute_panel(pdata, pscales, **params)
    273 
    274         return groupby_apply(data, 'PANEL', fn)

~/.local/lib/python3.8/site-packages/plotnine/stats/stat_ydensity.py in compute_panel(cls, data, scales, **params)
    134     @classmethod
    135     def compute_panel(cls, data, scales, **params):
--> 136         data = super(cls, cls).compute_panel(data, scales, **params)
    137 
    138         if not len(data):

~/.local/lib/python3.8/site-packages/plotnine/stats/stat.py in compute_panel(cls, data, scales, **params)
    303         stats = []
    304         for _, old in data.groupby('group'):
--> 305             new = cls.compute_group(old, scales, **params)
    306             unique = uniquecols(old)
    307             missing = unique.columns.difference(new.columns)

~/.local/lib/python3.8/site-packages/plotnine/stats/stat_ydensity.py in compute_group(cls, data, scales, **params)
    166             range_y = scales.y.dimension()
    167 
--> 168         dens = compute_density(data['y'], weight, range_y, **params)
    169         dens['y'] = dens['x']
    170         dens['x'] = np.mean([data['x'].min(), data['x'].max()])

~/.local/lib/python3.8/site-packages/plotnine/stats/stat_density.py in compute_density(x, weight, range, **params)
    172         bw = nrd0(x)
    173     kde = sm.nonparametric.KDEUnivariate(x)
--> 174     kde.fit(
    175         kernel=params['kernel'],
    176         bw=bw,

~/.local/lib/python3.8/site-packages/statsmodels/nonparametric/kde.py in fit(self, kernel, bw, fft, weights, gridsize, adjust, cut, clip)
    176             )
    177         else:
--> 178             density, grid, bw = kdensity(
    179                 endog,
    180                 kernel=kernel,

~/.local/lib/python3.8/site-packages/statsmodels/nonparametric/kde.py in kdensity(x, kernel, bw, weights, gridsize, adjust, clip, cut, retgrid)
    423 
    424     k = (
--> 425         x.T - grid[:, None]
    426     ) / bw  # uses broadcasting to make a gridsize x nobs
    427 

MemoryError: Unable to allocate array with shape (442608, 442608) and data type float64