has2k1/plotnine

stat_summary_bin multiple errors

Closed this issue · 1 comment

This may actually be two separate issues, but since I'm not sure about the second one, I'll combine them into a single report.

stat_summary_bin's documentation seems to say that bins, binwidth, and breaks are effectively exclusive (since breaks overrides binwidth, which in turn overrides bins), and that all of them can be tuples. From the docs:

class stat_summary_bin(stat):
    """
    Summarise y values at x intervals

    {usage}

    Parameters
    ----------
    {common_parameters}
    binwidth : float | tuple, default=None
        The width of the bins. The default is to use bins bins that
        cover the range of the data. You should always override this
        value, exploring multiple widths to find the best to illustrate
        the stories in your data.
    bins : int | tuple, default=30
        Number of bins. Overridden by binwidth.
    breaks : array_like | tuple[array_like, array_like], default=None
        Bin boundaries. This supercedes the `binwidth`, `bins`
        and `boundary` arguments.
    ...

Well, neither of these things seems to be true in 0.13.6.

bins, binwidth and breaks

When passing a binwidth such that the resulting number of bins is < 30, or breaks that define fewer than 30 bins, we get an IndexError unless bins is also explicitly set to the matching number of bins:

Traceback
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[251], line 53
     30 g = (
     31     p9.ggplot(condense_for_onset(dev))
     32     + p9.aes(x=\"prog_onset_P\", y=\"temperature\")
   (...)
     50     # + p9.geom_line(data=manual_fit_data())
     51 )
     52 g
---> 53 g.draw(True)
     54 g._build()
     55 g.layers[0].data

File ~/.cache/micromamba/lib/python3.12/site-packages/plotnine/ggplot.py:272, in ggplot.draw(self, show)
    270 self = deepcopy(self)
    271 with plot_context(self, show=show):
--> 272     self._build()
    274     # setup
    275     self.figure, self.axs = self.facet.setup(self)

File ~/.cache/micromamba/lib/python3.12/site-packages/plotnine/ggplot.py:376, in ggplot._build(self)
    373 layout.map_position(layers)
    375 # Apply and map statistics
--> 376 layers.compute_statistic(layout)
    377 layers.map_statistic(self)
    379 # Prepare data in geoms
    380 # e.g. from y and width to ymin and ymax

File ~/.cache/micromamba/lib/python3.12/site-packages/plotnine/layer.py:461, in Layers.compute_statistic(self, layout)
    459 def compute_statistic(self, layout: Layout):
    460     for l in self:
--> 461         l.compute_statistic(layout)

File ~/.cache/micromamba/lib/python3.12/site-packages/plotnine/layer.py:284, in layer.compute_statistic(self, layout)
    282 data = self.stat.use_defaults(data)
    283 data = self.stat.setup_data(data)
--> 284 data = self.stat.compute_layer(data, params, layout)
    285 self.data = data

File ~/.cache/micromamba/lib/python3.12/site-packages/plotnine/stats/stat.py:308, in stat.compute_layer(cls, data, params, layout)
    305     pscales = layout.get_scales(pdata[\"PANEL\"].iloc[0])
    306     return cls.compute_panel(pdata, pscales, **params)
--> 308 return groupby_apply(data, \"PANEL\", fn)

File ~/.cache/micromamba/lib/python3.12/site-packages/plotnine/_utils/__init__.py:666, in groupby_apply(df, cols, func, *args, **kwargs)
    662 lst = []
    663 for _, d in df.groupby(cols, observed=True):
    664     # function fn should be free to modify dataframe d, therefore
    665     # do not mark d as a slice of df i.e no SettingWithCopyWarning
--> 666     lst.append(func(d, *args, **kwargs))
    667 return pd.concat(lst, axis=axis, ignore_index=True)

File ~/.cache/micromamba/lib/python3.12/site-packages/plotnine/stats/stat.py:306, in stat.compute_layer.<locals>.fn(pdata)
    304     return pdata
    305 pscales = layout.get_scales(pdata[\"PANEL\"].iloc[0])
--> 306 return cls.compute_panel(pdata, pscales, **params)

File ~/.cache/micromamba/lib/python3.12/site-packages/plotnine/stats/stat.py:343, in stat.compute_panel(cls, data, scales, **params)
    341 stats = []
    342 for _, old in data.groupby(\"group\"):
--> 343     new = cls.compute_group(old, scales, **params)
    344     unique = uniquecols(old)
    345     missing = unique.columns.difference(new.columns)

File ~/.cache/micromamba/lib/python3.12/site-packages/plotnine/stats/stat_summary_bin.py:168, in stat_summary_bin.compute_group(cls, data, scales, **params)
    166     out["width"] = 0.9
    167 else:
--> 168     out["width"] = np.diff(breaks)[bins - 1]
    170 return out

IndexError: index 29 is out of bounds for axis 0 with size 19

Code for clarity:

import plotnine as p9

# this works, and gives 30 bins:
p9.ggplot(df) + p9.aes(x="x", y="y") + p9.stat_summary_bin()

# this fails unless the number of resulting bins is >= 30:
p9.ggplot(df) + p9.aes(x="x", y="y") + p9.stat_summary_bin(binwidth=10)

# this also fails:
p9.ggplot(df) + p9.aes(x="x", y="y") + p9.stat_summary_bin(breaks = np.linspace(df.x.min(), df.x.max(), 29)) # any int smaller than 30 fails

tuples

When passing a tuple to any of the three, the function just fails with a traceback like this one:

Traceback 2
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[252], line 53
     30 g = (
     31     p9.ggplot(condense_for_onset(dev))
     32     + p9.aes(x=\"prog_onset_P\", y=\"temperature\")
   (...)
     50     # + p9.geom_line(data=manual_fit_data())
     51 )
     52 g
---> 53 g.draw(True)
     54 g._build()
     55 g.layers[0].data

File ~/.cache/micromamba/lib/python3.12/site-packages/plotnine/ggplot.py:272, in ggplot.draw(self, show)
    270 self = deepcopy(self)
    271 with plot_context(self, show=show):
--> 272     self._build()
    274     # setup
    275     self.figure, self.axs = self.facet.setup(self)

File ~/.cache/micromamba/lib/python3.12/site-packages/plotnine/ggplot.py:376, in ggplot._build(self)
    373 layout.map_position(layers)
    375 # Apply and map statistics
--> 376 layers.compute_statistic(layout)
    377 layers.map_statistic(self)
    379 # Prepare data in geoms
    380 # e.g. from y and width to ymin and ymax

File ~/.cache/micromamba/lib/python3.12/site-packages/plotnine/layer.py:461, in Layers.compute_statistic(self, layout)
    459 def compute_statistic(self, layout: Layout):
    460     for l in self:
--> 461         l.compute_statistic(layout)

File ~/.cache/micromamba/lib/python3.12/site-packages/plotnine/layer.py:284, in layer.compute_statistic(self, layout)
    282 data = self.stat.use_defaults(data)
    283 data = self.stat.setup_data(data)
--> 284 data = self.stat.compute_layer(data, params, layout)
    285 self.data = data

File ~/.cache/micromamba/lib/python3.12/site-packages/plotnine/stats/stat.py:308, in stat.compute_layer(cls, data, params, layout)
    305     pscales = layout.get_scales(pdata[\"PANEL\"].iloc[0])
    306     return cls.compute_panel(pdata, pscales, **params)
--> 308 return groupby_apply(data, \"PANEL\", fn)

File ~/.cache/micromamba/lib/python3.12/site-packages/plotnine/_utils/__init__.py:666, in groupby_apply(df, cols, func, *args, **kwargs)
    662 lst = []
    663 for _, d in df.groupby(cols, observed=True):
    664     # function fn should be free to modify dataframe d, therefore
    665     # do not mark d as a slice of df i.e no SettingWithCopyWarning
--> 666     lst.append(func(d, *args, **kwargs))
    667 return pd.concat(lst, axis=axis, ignore_index=True)

File ~/.cache/micromamba/lib/python3.12/site-packages/plotnine/stats/stat.py:306, in stat.compute_layer.<locals>.fn(pdata)
    304     return pdata
    305 pscales = layout.get_scales(pdata[\"PANEL\"].iloc[0])
--> 306 return cls.compute_panel(pdata, pscales, **params)

File ~/.cache/micromamba/lib/python3.12/site-packages/plotnine/stats/stat.py:343, in stat.compute_panel(cls, data, scales, **params)
    341 stats = []
    342 for _, old in data.groupby(\"group\"):
--> 343     new = cls.compute_group(old, scales, **params)
    344     unique = uniquecols(old)
    345     missing = unique.columns.difference(new.columns)

File ~/.cache/micromamba/lib/python3.12/site-packages/plotnine/stats/stat_summary_bin.py:143, in stat_summary_bin.compute_group(cls, data, scales, **params)
    133 boundary = params["boundary"]
    135 func = make_summary_fun(
    136     params["fun_data"],
    137     params["fun_y"],
   (...)
    140     params["fun_args"],
    141 )
--> 143 breaks = fuzzybreaks(scales.x, breaks, boundary, binwidth, bins)
    144 data["bin"] = pd.cut(
    145     data["x"],
    146     bins=breaks,  # pyright: ignore
    147     labels=False,
    148     include_lowest=True,
    149 )
    151 def func_wrapper(data: pd.DataFrame) -> pd.DataFrame:

File ~/.cache/micromamba/lib/python3.12/site-packages/plotnine/stats/binning.py:305, in fuzzybreaks(scale, breaks, boundary, binwidth, bins, right)
    299     bins = int(np.ceil((srange[1] - boundary) / binwidth))
    301 # To minimise precision errors, we do not pass the boundary and
    302 # binwidth into np.arange as params. The resulting breaks
    303 # can then be adjusted with finer(epsilon based rather than
    304 # some arbitrary small number) precision.
--> 305 breaks = np.arange(boundary, srange[1] + binwidth, binwidth)
    306 return _adjust_breaks(breaks, right)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

I haven't looked in the code in depth, but my guess is that there is no handling for tuples.

Fix in 7a25a92.