[Core][Enhancement]: Add dummies to Aggregate

Question

[Core][Enhancement]: Add dummies to Aggregate

NudnikShpilkis opened this issue 10 months ago · 0 comments

Description

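Since MLForecast can handle static features, `aggregate` should optionally return the hierarchical variables as features: the one-hot-encoded version of each hierarchical variable, plus an indicator (dummy) column for each variable that flags the levels at which that variable has been aggregated out. Below is an updated `aggregate` function (`aggregate2`) with an `add_dummies` option, followed by an example that adds those dummies.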

import os
import sys
import warnings
import numpy as np
import pandas as pd
import hierarchicalforecast.methods as hfm
from datetime import datetime
from typing import Optional
from hierarchicalforecast.utils import aggregate, _to_upper_hierarchy
from hierarchicalforecast.core import HierarchicalReconciliation
from sklearn.preprocessing import OneHotEncoder
from scipy import sparse
from typing import Callable, Dict, List, Optional, Iterable

# Download the tourism dataset and keep only two states and three travel
# purposes.
Y_df = pd.read_csv(
    'https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/tourism.csv'
)
Y_df = Y_df.query("State in ['Victoria', 'Tasmania']")
Y_df = Y_df.query("Purpose in ['Business', 'Holiday', 'Visiting']")

# Rename the target and timestamp columns to `y` and `ds`.
Y_df = Y_df.rename(columns={'Trips': 'y', 'Quarter': 'ds'})

# Every row shares a single top-level value, which serves as the hierarchy root.
Y_df.insert(loc=0, column='Country', value='Australia')
Y_df = Y_df[['Country', 'Region', 'State', 'Purpose', 'ds', 'y']]

# Turn "<digits> Q<digit>" into "<digits>-Q<digit>" before parsing,
# presumably so that pandas recognises the value as a quarter.
Y_df['ds'] = pd.to_datetime(
    Y_df['ds'].str.replace(r'(\d+) (Q\d)', r'\1-\2', regex=True)
)
Y_df.head()

# Aggregation levels, ordered from the grand total down to the bottom-level
# series. Every level is rooted at 'Country'; only the remaining grouping
# columns differ between levels.
spec = [
    ['Country', *rest]
    for rest in (
        [],
        ['State'],
        ['Purpose'],
        ['State', 'Region'],
        ['State', 'Purpose'],
        ['State', 'Region', 'Purpose'],
    )
]

def aggregate2(
    df: pd.DataFrame,
    spec: List[List[str]],
    is_balanced: bool = False,
    sparse_s: bool = False,
    add_dummies: bool = False
):
    """Utils Aggregation Function.

    Aggregates bottom level series contained in the pandas DataFrame `df` according
    to levels defined in the `spec` list.

    Parameters
    ----------
    df : pandas DataFrame
        Dataframe with columns `['ds', 'y']` and columns to aggregate.
    spec : list of list of str
        List of levels. Each element of the list should contain a list of columns of `df` to aggregate.
        Levels are processed in order of increasing length; the longest one is used as the bottom level.
    is_balanced : bool (default=False)
        Deprecated.
    sparse_s : bool (default=False)
        Return `S_df` as a sparse dataframe.
    add_dummies : bool (default=False)
        Add features describing the hierarchy to `Y_df`. For every column `c` named in `spec`
        this adds:

        * `c`: the value of the column, or NaN on levels that do not group by `c`.
        * `c_<value>`: one 0/1 column per unique value of `c` in `df`
          (all zeros on levels that do not group by `c`).
        * `agg_c`: 1 on levels that do not group by `c`, 0 otherwise.

    Returns
    -------
    Y_df : pandas DataFrame
        Hierarchically structured series, indexed by `unique_id`.
    S_df : pandas DataFrame
        Summing dataframe.
    tags : dict
        Aggregation indices.

    Raises
    ------
    ValueError
        If `df` contains null values or `spec` is empty.
    """
    # Checks
    if df.isnull().values.any():
        raise ValueError('`df` contains null values')
    if is_balanced:
        warnings.warn(
            "`is_balanced` is deprecated and will be removed in a future version. "
            "Don't set this argument to suppress this warning.",
            category=DeprecationWarning,
        )
    spec = sorted(spec, key=len)
    if not spec:
        # Without this check the failure would be an opaque IndexError on `spec[-1]`.
        raise ValueError('`spec` must contain at least one level')
    if add_dummies:
        # dict.fromkeys drops repeated column names while keeping first-seen order.
        all_spec = list(dict.fromkeys(col for levels in spec for col in levels))
        # These names match what pd.get_dummies(..., prefix=col) generates below.
        all_dummies = [f'{col}_{value}' for col in all_spec for value in df[col].unique()]
        all_agg = [f'agg_{col}' for col in all_spec]
    bottom = spec[-1]
    aggs = []
    tags = {}
    for levels in spec:
        agg = df.groupby(levels + ['ds'])['y'].sum().reset_index('ds')

        # Build the series identifier by joining the level values with '/'.
        # A '/' inside any value after the first is replaced so it cannot be
        # mistaken for the separator.
        group = agg.index.get_level_values(0)
        for level in levels[1:]:
            group = group + '/' + agg.index.get_level_values(level).str.replace('/', '_')

        # Add the variable and the one-hot encoded version of the variable.
        # This must happen while the grouping columns are still in the index.
        if add_dummies:
            for level in levels:
                agg[level] = agg.index.get_level_values(level).values
                agg = pd.concat([agg, pd.get_dummies(agg[level], prefix=level, dtype=int)], axis=1)

        agg.index = group
        agg.index.name = 'unique_id'
        tags['/'.join(levels)] = group.unique().values

        # Add the aggregated levels for each hierarchical feature and fill in the one-hot encoded variables
        if add_dummies:
            for col in all_spec:
                if col not in agg:
                    # `np.nan`, not `np.NaN`: the latter alias was removed in NumPy 2.0.
                    agg[col] = np.nan
                agg[f'agg_{col}'] = agg[col].isna().astype(int)

            # Dummies for values that do not occur on this level are all zero.
            for dummy in all_dummies:
                if dummy not in agg:
                    agg[dummy] = 0
        aggs.append(agg)

    dummies_cols = all_spec + all_dummies + all_agg if add_dummies else []
    Y_df = pd.concat(aggs)[['ds', 'y'] + dummies_cols]

    # construct S
    bottom_key = '/'.join(bottom)
    bottom_levels = tags[bottom_key]
    # One row per bottom-level series, one column per level of `spec`.
    S = np.empty((len(bottom_levels), len(spec)), dtype=object)
    for j, levels in enumerate(spec[:-1]):
        # Presumably maps every bottom-level id to its ancestor id at `levels`;
        # see `_to_upper_hierarchy` for the exact behaviour.
        S[:, j] = _to_upper_hierarchy(bottom, bottom_levels, '/'.join(levels))
    S[:, -1] = tags[bottom_key]
    categories = list(tags.values())
    try:
        encoder = OneHotEncoder(categories=categories, sparse_output=sparse_s, dtype=np.float32)
    except TypeError:  # sklearn < 1.2
        encoder = OneHotEncoder(categories=categories, sparse=sparse_s, dtype=np.float32)
    # Transpose so that rows are the series of every level and columns are the bottom-level series.
    S = encoder.fit_transform(S).T
    if sparse_s:
        df_constructor = pd.DataFrame.sparse.from_spmatrix
    else:
        df_constructor = pd.DataFrame
    S_df = df_constructor(S, index=np.hstack(categories), columns=bottom_levels)
    return Y_df, S_df, tags

# Without dummies: only `ds` and `y` are returned for every aggregated series.
Y_df2, S_df, tags = aggregate2(df=Y_df, spec=spec, add_dummies=False)
# Move `unique_id` out of the index and into a regular column.
Y_df2 = Y_df2.reset_index(drop=False)
Y_df2.head()

# With dummies: the hierarchical variables, their one-hot encodings and the
# `agg_*` indicator columns are appended after `ds` and `y`.
Y_df2, S_df, tags = aggregate2(df=Y_df, spec=spec, add_dummies=True)
Y_df2 = Y_df2.reset_index(drop=False)
Y_df2.head()

Use case

Improve MLForecast model performance by allowing users to include static features made up of the hierarchical variables.