/AugCoDa

Data Augmentation for Compositional Data

Primary Language: Python

AugCoDa

Code to reproduce the results in the paper "Data Augmentation for Compositional Data: Advancing Predictive Models of the Microbiome".

To run the augmentations on your own data, use the functions below (they assume `import numpy as np`):

def aitchison_mixup(X_train, y_train, factor=10, weight=0.5):
    """Applies Aitchison Mixup to some training data.

    For every distinct label in ``y_train``, synthetic samples are built as
    random convex combinations of pairs of training rows that share that
    label. The synthetic rows are appended to the original data and the
    result is shuffled.

    Args:
        X_train: model inputs, a 2-D array-like with one row per sample.
        y_train: model outputs (class labels), one per row of X_train.
            Any set of label values is accepted.
        factor: the enlargement factor by which to augment X and y.
            A class with n rows ends up with int(factor * n) rows
            (its n originals plus the synthetic ones). Classes for which
            this yields no extra rows (e.g. factor <= 1) are left as is.
        weight: controls the weight of the synthetic data relative to
            the original data. Every original sample has weight 1; the
            synthetic samples of one class share a total weight of
            weight / (1 - weight) * len(X_train). Must be below 1.
            NOTE(review): this total is assigned per class, so with k
            augmented classes the synthetic data weighs k times that
            amount overall -- confirm whether that is intended.

    Returns:
        X_aug: model inputs including both those from the original
            training data and those generated by data augmentation.
        y_aug: model outputs including both those from the original
            training data and those generated by data augmentation.
        w_aug: float sample weights for each sample in X_aug and y_aug
            (1 for original samples, see ``weight`` for synthetic ones).
    """
    # Work on ndarrays so that boolean masking and np.unique behave the same
    # for lists, ndarrays and pandas objects.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    n_total = X_train.shape[0]

    X_parts = [X_train]
    y_parts = [y_train]
    # Weights are always float, independent of the label dtype.
    w_parts = [np.ones(y_train.shape[0], dtype=float)]

    for val in np.unique(y_train):
        X_temp = X_train[y_train == val, :]
        n = X_temp.shape[0]
        n_aug = int(factor * n) - n
        if n_aug <= 0:
            # Nothing to generate; also avoids dividing by zero below.
            continue

        # One mixing coefficient per synthetic sample, as a column so that
        # it broadcasts across features.
        lam = np.random.rand(n_aug).reshape([-1, 1])
        idx1 = np.random.choice(n, size=n_aug)
        idx2 = np.random.choice(n, size=n_aug)

        # Take convex combination.
        # NOTE(review): the combination is computed directly on the values
        # of X_train; it matches Aitchison geometry only if X_train is
        # already log-transformed -- confirm against callers.
        X_aug = lam * X_temp[idx1, :] + (1 - lam) * X_temp[idx2, :]

        X_parts.append(X_aug)
        y_parts.append(np.repeat(val, n_aug))
        w_parts.append(
            np.repeat(weight / (1 - weight) * n_total / n_aug, n_aug)
        )

    X = np.concatenate(X_parts, axis=0)
    y = np.concatenate(y_parts)
    w = np.concatenate(w_parts)

    # Shuffle data
    idx = np.arange(X.shape[0])
    np.random.shuffle(idx)

    return X[idx], y[idx], w[idx]

def compositional_feature_dropout(X_train, y_train, factor=10, weight=0.5):
    """Applies Compositional Feature Dropout to some training data.

    For every distinct label in ``y_train``, synthetic samples are built by
    copying randomly chosen training rows with that label and overwriting
    a random subset of their features. The synthetic rows are appended to
    the original data and the result is shuffled.

    Args:
        X_train: model inputs, a 2-D array-like with one row per sample.
        y_train: model outputs (class labels), one per row of X_train.
            Any set of label values is accepted.
        factor: the enlargement factor by which to augment X and y.
            A class with n rows ends up with int(factor * n) rows
            (its n originals plus the synthetic ones). Classes for which
            this yields no extra rows (e.g. factor <= 1) are left as is.
        weight: controls the weight of the synthetic data relative to
            the original data. Every original sample has weight 1; the
            synthetic samples of one class share a total weight of
            weight / (1 - weight) * len(X_train). Must be below 1.
            NOTE(review): this total is assigned per class, so with k
            augmented classes the synthetic data weighs k times that
            amount overall -- confirm whether that is intended.

    Returns:
        X_aug: model inputs including both those from the original
            training data and those generated by data augmentation.
        y_aug: model outputs including both those from the original
            training data and those generated by data augmentation.
        w_aug: float sample weights for each sample in X_aug and y_aug
            (1 for original samples, see ``weight`` for synthetic ones).
    """
    # Work on ndarrays so that boolean masking and np.unique behave the same
    # for lists, ndarrays and pandas objects.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    n_total = X_train.shape[0]

    X_parts = [X_train]
    y_parts = [y_train]
    # Weights are always float, independent of the label dtype.
    w_parts = [np.ones(y_train.shape[0], dtype=float)]

    for val in np.unique(y_train):
        X_temp = X_train[y_train == val, :]
        n = X_temp.shape[0]
        n_aug = int(factor * n) - n
        if n_aug <= 0:
            # Nothing to generate; also avoids dividing by zero below.
            continue

        # Each synthetic sample draws its own dropout probability p, then
        # every feature of that sample is masked independently with
        # probability p. The draw has shape (features, n_aug) so that p
        # broadcasts per sample; it is transposed to (n_aug, features).
        p = np.random.rand(n_aug)
        idx = np.random.choice(n, size=n_aug)
        mask = np.random.binomial(1, p, [X_temp.shape[1], n_aug]).T

        X_new = X_temp[idx, :].copy()
        # NOTE(review): masked entries are overwritten with 1 (not 0) and
        # rows are not renormalised -- presumably 1 stands for a zero count
        # plus a pseudocount; confirm against callers.
        X_new[mask.astype('bool')] = 1

        X_parts.append(X_new)
        y_parts.append(np.repeat(val, n_aug))
        w_parts.append(
            np.repeat(weight / (1 - weight) * n_total / n_aug, n_aug)
        )

    X = np.concatenate(X_parts, axis=0)
    y = np.concatenate(y_parts)
    w = np.concatenate(w_parts)

    # Shuffle data
    idx = np.arange(X.shape[0])
    np.random.shuffle(idx)

    return X[idx], y[idx], w[idx]

def compositional_cutmix(X_train, y_train, factor=10, weight=0.5):
    """Applies Compositional CutMix to some training data.

    For every distinct label in ``y_train``, synthetic samples are built by
    taking each feature from one of two randomly chosen training rows
    with that label and rescaling the mixed row to sum to 1. The synthetic
    rows are appended to the original data and the result is shuffled.

    Args:
        X_train: model inputs, a 2-D array-like with one row per sample.
        y_train: model outputs (class labels), one per row of X_train.
            Any set of label values is accepted.
        factor: the enlargement factor by which to augment X and y.
            A class with n rows ends up with int(factor * n) rows
            (its n originals plus the synthetic ones). Classes for which
            this yields no extra rows (e.g. factor <= 1) are left as is.
        weight: controls the weight of the synthetic data relative to
            the original data. Every original sample has weight 1; the
            synthetic samples of one class share a total weight of
            weight / (1 - weight) * len(X_train). Must be below 1.
            NOTE(review): this total is assigned per class, so with k
            augmented classes the synthetic data weighs k times that
            amount overall -- confirm whether that is intended.

    Returns:
        X_aug: model inputs including both those from the original
            training data and those generated by data augmentation.
        y_aug: model outputs including both those from the original
            training data and those generated by data augmentation.
        w_aug: float sample weights for each sample in X_aug and y_aug
            (1 for original samples, see ``weight`` for synthetic ones).
    """
    # Work on ndarrays so that boolean masking and np.unique behave the same
    # for lists, ndarrays and pandas objects.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    n_total = X_train.shape[0]

    X_parts = [X_train]
    y_parts = [y_train]
    # Weights are always float, independent of the label dtype.
    w_parts = [np.ones(y_train.shape[0], dtype=float)]

    for val in np.unique(y_train):
        X_temp = X_train[y_train == val, :]
        n = X_temp.shape[0]
        n_aug = int(factor * n) - n
        if n_aug <= 0:
            # Nothing to generate; also avoids dividing by zero below.
            continue

        idx1 = np.random.choice(n, size=n_aug)
        idx2 = np.random.choice(n, size=n_aug)

        # Each synthetic sample draws its own probability p of taking a
        # feature from the first parent. The draw has shape
        # (features, n_aug) so that p broadcasts per sample; it is
        # transposed to (n_aug, features).
        p = np.random.rand(n_aug)
        mask = np.random.binomial(1, p, [X_temp.shape[1], n_aug]).T
        X_aug = mask * X_temp[idx1, :] + (1 - mask) * X_temp[idx2, :]

        # Rescale every synthetic row to sum to 1.
        # NOTE(review): a mixed row that sums to zero yields NaN here.
        X_aug = X_aug / X_aug.sum(axis=1, keepdims=True)

        X_parts.append(X_aug)
        y_parts.append(np.repeat(val, n_aug))
        w_parts.append(
            np.repeat(weight / (1 - weight) * n_total / n_aug, n_aug)
        )

    X = np.concatenate(X_parts, axis=0)
    y = np.concatenate(y_parts)
    w = np.concatenate(w_parts)

    # Shuffle data
    idx = np.arange(X.shape[0])
    np.random.shuffle(idx)

    return X[idx], y[idx], w[idx]