CCA is very small between a random net vs a pretrained one, bug?
brando90 opened this issue · 13 comments
I am getting this issue:
# Minimal reproduction: PWCCA distance between a pretrained and a freshly
# initialised ResNet-18, measured at the same layer, once with size=8 and once
# with size=None.
# NOTE: this excerpt relies on `torch`, `resnet18` and a batch `data` being
# defined beforehand (the full script further down defines them).
import anatome
print(anatome)
# from anatome import CCAHook
from anatome import SimilarityHook

model = resnet18(pretrained=True)
random_model = resnet18()
# random_model = resnet18().cuda()
# hook1 = CCAHook(model, "layer1.0.conv1")
# hook2 = CCAHook(random_model, "layer1.0.conv1")
cxa_dist_type = 'pwcca'
layer_name = "layer1.0.conv1"
# One hook per network, both attached to the same named layer.
hook1 = SimilarityHook(model, layer_name, cxa_dist_type)
hook2 = SimilarityHook(random_model, layer_name, cxa_dist_type)
# Run both networks on the same input batch; gradients are not needed.
with torch.no_grad():
    model(data[0])
    random_model(data[0])
distance_btw_nets = hook1.distance(hook2, size=8)
print(f'{distance_btw_nets=}')
# NOTE(review): per the discussion in this thread, size=None uses the whole
# activation. If that yields more features than samples in the batch, a
# CCA-style similarity is trivially ~1 (distance ~0), which would explain the
# near-zero value reported here -- confirm against anatome's implementation.
distance_btw_nets = hook1.distance(hook2, size=None)
print(f'{distance_btw_nets=}')
<module 'anatome' from '/Users/brando/anaconda3/envs/metalearning/lib/python3.9/site-packages/anatome/__init__.py'>
distance_btw_nets=0.3089657425880432
distance_btw_nets=-2.468004822731018e-08
The second call is supposed to use the full (non-downsampled) features, yet the distance is much smaller. I expected it to increase substantially, since without downsampling we are using more information.
Is this a bug?
"""
attempt at a colab: https://colab.research.google.com/drive/1GrhWrWFPmlc6kmxc0TJY0Nb6qOBBgjzX#scrollTo=KhUWNu3J_6i4
"""
#%%
# import torch
# import torchvision
# from torch.nn import functional as F
# from torchvision.models import resnet18
# from torchvision import transforms
# from torchvision.datasets import ImageFolder
# from torch.utils.data import DataLoader
#
# import matplotlib.pyplot as plt
#
# batch_size = 128
#
# model = resnet18(pretrained=True)
# imagenet = ImageFolder('~/.torch/data/imagenet/val',
# transforms.Compose([transforms.CenterCrop(224), transforms.ToTensor(),
# transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))]))
# data = next(iter(DataLoader(imagenet, batch_size=batch_size, num_workers=8)))
#%%
import torch
import torchvision
from torch.nn import functional as F
from torchvision.models import resnet18
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
# Loader configuration shared by the train and test loaders below.
batch_size = 128
num_workers = 0

# ImageNet alternative, kept for reference:
# imagenet = ImageFolder('~/.torch/data/imagenet/val',
# transforms.Compose([transforms.CenterCrop(224), transforms.ToTensor(),
# transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))]))
# data = next(iter(DataLoader(imagenet, batch_size=batch_size, num_workers=8)))

# Convert images to tensors, then normalise each channel with mean 0.5 / std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# CIFAR-10 training split (downloaded into ./data when missing), shuffled.
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform,
)
trainloader = DataLoader(
    trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers,
)

# CIFAR-10 test split, iterated in a fixed order.
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform,
)
testloader = DataLoader(
    testset, batch_size=batch_size, shuffle=False, num_workers=num_workers,
)

classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

# A single training batch; the cells below feed data[0] to the networks.
data = next(iter(trainloader))
#%%
# PWCCA distance between a pretrained and a freshly initialised ResNet-18,
# measured at the same layer on the batch `data`, once with size=8 and once
# with size=None.
import anatome
print(anatome)
# from anatome import CCAHook
from anatome import SimilarityHook

model = resnet18(pretrained=True)
random_model = resnet18()
# random_model = resnet18().cuda()
# hook1 = CCAHook(model, "layer1.0.conv1")
# hook2 = CCAHook(random_model, "layer1.0.conv1")
cxa_dist_type = 'pwcca'
layer_name = "layer1.0.conv1"
# One hook per network, both attached to the same named layer.
hook1 = SimilarityHook(model, layer_name, cxa_dist_type)
hook2 = SimilarityHook(random_model, layer_name, cxa_dist_type)
# Run both networks on the same input batch; gradients are not needed.
with torch.no_grad():
    model(data[0])
    random_model(data[0])
distance_btw_nets = hook1.distance(hook2, size=8)
print(f'{distance_btw_nets=}')
# NOTE(review): per the discussion in this thread, size=None uses the whole
# activation. If that yields more features than the 128 samples in the batch,
# a CCA-style similarity is trivially ~1 (distance ~0), which would explain a
# near-zero value here -- confirm against anatome's implementation.
distance_btw_nets = hook1.distance(hook2, size=None)
print(f'{distance_btw_nets=}')
#%%
# PWCCA distance between two separately constructed Learner networks,
# measured at the same layer on the batch `data`.
from meta_learning.base_models.learner_from_opt_as_few_shot_paper import Learner
from argparse import Namespace

args = Namespace()
# args.k_eval = 150
args.image_size = 84
args.bn_eps = 1e-3
args.bn_momentum = 0.95
args.n_classes = 5
args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): image_size is 84 here; whether the images in `data` match that
# size cannot be verified from this cell -- confirm before running.
model1 = Learner(image_size=args.image_size, bn_eps=args.bn_eps, bn_momentum=args.bn_momentum, n_classes=args.n_classes).to(args.device)
model2 = Learner(image_size=args.image_size, bn_eps=args.bn_eps, bn_momentum=args.bn_momentum, n_classes=args.n_classes).to(args.device)
cxa_dist_type = 'pwcca'
layer_name = "model.features.conv1"
hook1 = SimilarityHook(model1, layer_name, cxa_dist_type)
hook2 = SimilarityHook(model2, layer_name, cxa_dist_type)
with torch.no_grad():
    # Both models live on args.device, so the input batch must be moved there
    # as well; otherwise the forward pass fails whenever CUDA is available.
    batch_x = data[0].to(args.device)
    print(f'{batch_x.size()=}')
    model1(batch_x)
    model2(batch_x)
distance_btw_nets = hook1.distance(hook2, size=8)
print(f'{distance_btw_nets=}')
output:
distance_btw_nets=0.3089657425880432
distance_btw_nets=-2.468004822731018e-08
But the distance should increase when the dimensionality of the activations is increased - especially when one net is pre-trained while the other is random.
Is this correct, @moskomule? Shouldn't the distance increase, not decrease, as size=8
goes to size=None?
(The second call uses the whole activations, according to the code.)
@moskomule sorry for the ping again...but just wanted to check if this was a bug or not.
Thanks for your time and patience, it's appreciated!
I'm not sure. I think several methods have been proposed to fix unexpected behaviors of previous works, so it could happen.
I'm not sure. I think several methods have been proposed to fix unexpected behaviors of previous works, so it could happen.
did you test your CCA code with for example scipy's results? (to make sure anatome's implementation is correct?)
No
No
how are you testing it?
@moskomule I ran a sanity check from the original google tutorial and your code seems to work! Thought you'd be happy to know:
https://github.com/google/svcca/blob/master/tutorials/001_Introduction.ipynb
As n grows large, the estimated SVCCA similarity converges to the true SVCCA value; at first the similarity is very high because of the small number of data points.
Actually, this stopped working :( I need to fix the bug; the library might not be working at all without this sanity check.
ok it works now. Use this githash of anatome:
anatome git hash
c4c0691
c4c069183aca8aad6f73a4b7ab86f7f7e4ca3d04
Thanks for reporting. Happy to know that!
Thanks for reporting. Happy to know that!
No worries! will share code. seems I forgot
#%%
"""
The similarity of the same network should always be 1.0 on same input.
"""
import torch
import torch.nn as nn
import uutils.torch_uu
from uutils.torch_uu import cxa_sim, approx_equal
from uutils.torch_uu.models import get_named_identity_one_layer_linear_model
print('--- Sanity check: sCCA = 1.0 when using same net twice with same input. --')

# Distance type under test (alternative kept for reference).
# cxa_dist_type = 'pwcca'
cxa_dist_type = 'svcca'
layer_name = 'fc0'

# B samples, with matching input and output widths.
B: int = 2000
Din: int = 10
Dout: int = Din

# Both names refer to the very same module object, so the two sets of
# activations being compared are identical.
# (presumably an identity-initialised linear layer, judging by the helper's name)
mdl1: nn.Module = get_named_identity_one_layer_linear_model(D=Din)
mdl2: nn.Module = mdl1

# - ends up comparing two matrices of size [B, Dout], on same data, on same model
X: torch.Tensor = torch.distributions.Normal(loc=0.0, scale=1.0).sample((B, Din))
sim: float = cxa_sim(
    mdl1,
    mdl2,
    X,
    layer_name,
    downsample_size=None,
    iters=1,
    cxa_dist_type=cxa_dist_type,
)
print(f'Should be very very close to 1.0: {sim=}')
print(f'Is it close to 1.0? {approx_equal(sim, 1.0)}')
assert approx_equal(sim, 1.0)
#%%
"""
Reproducing: How many data points: https://github.com/google/svcca/blob/master/tutorials/001_Introduction.ipynb
As n increases, the cca sim should decrease until it converges to the true max linear correlation in the data.
This is because when n is small relative to D it's easy to correlate Xw and Yw, since there are fewer equations (n data points) than unknowns (D features).
Similarly, the similarity decreases because the more data there is, the more variation has to be captured and thus the less
correlation there will be.
This is consistent with CCA because, for standardized projections a = Xw and b = Yw, E[(a - b)^2] = 2 - 2*corr(a, b), so a small expected squared distance corresponds to a high Pearson correlation.
"""
from pathlib import Path
from matplotlib import pyplot as plt
import torch
import torch.nn as nn
import uutils
from uutils.torch_uu import cxa_sim, approx_equal
from uutils.torch_uu.models import get_named_one_layer_random_linear_model
import uutils.plot as uulot
# Part 1: a single comparison with few data points (B=10) and many features
# (Dout=300) between two separately constructed random linear models.
print('\n--- Sanity check: when number of data points B is smaller than D, then it should be trivial to make similiarty 1.0 '
      '(even if nets/matrices are different)')
B: int = 10
Dout: int = 300
mdl1: nn.Module = get_named_one_layer_random_linear_model(B, Dout)
mdl2: nn.Module = get_named_one_layer_random_linear_model(B, Dout)
layer_name = 'fc0'
# cxa_dist_type = 'pwcca'
cxa_dist_type = 'svcca'
# - get sim for B << D e.g. [B=10, D=300] easy to "fit", to many degrees of freedom
X: torch.Tensor = uutils.torch_uu.get_identity_data(B)
# mdl1(X) : [B, Dout] = [B, B] [B, Dout]
sim: float = cxa_sim(mdl1, mdl2, X, layer_name, downsample_size=None, iters=1, cxa_dist_type=cxa_dist_type)
print(f'Should be very very close to 1.0: {sim=} (since we have many features to match the two Xw1, Yw2).')
print(f'Is it close to 1.0? {approx_equal(sim, 1.0)}')
# assert(approx_equal(sim, 1.0))

# Part 2: sweep the number of data points with Dout fixed and record the
# similarity for each size.
print('\n-- Santity: just makes sure that when low data is present sim is high and afterwards (as n->infty) sim (CCA) '
      'converges to the "true" cca value (eventually)')
# data_sizes: list[int] = [10, 25, 50, 100, 101, 200, 500, 1_000, 2_000, 5_000]
data_sizes: list[int] = [10, 25, 50, 100, 101, 200, 500, 1_000, 2_000, 5_000, 10_000]
# data_sizes: list[int] = [10, 25, 50, 100, 101, 200, 500, 1_000, 2_000, 5_000, 10_000, 50_000, 100_000]
# data_sizes: list[int] = [10, 25, 50, 100, 200, 500, 1_000, 2_000, 5_000, 10_000]
sims: list[float] = []
for b in data_sizes:
    # Fresh data and a fresh pair of random models for every data size b.
    X: torch.Tensor = uutils.torch_uu.get_identity_data(b)
    mdl1: nn.Module = get_named_one_layer_random_linear_model(b, Dout)
    mdl2: nn.Module = get_named_one_layer_random_linear_model(b, Dout)
    # print(f'{b=}')
    sim: float = cxa_sim(mdl1, mdl2, X, layer_name, downsample_size=None, iters=1, cxa_dist_type=cxa_dist_type)
    # print(f'{sim=}')
    sims.append(sim)
print(f'{sims=}')
# The x axis is the number of data points, so the title names n rather than D;
# the horizontal-line marker sits at x = Dout.
uulot.plot(x=data_sizes, y=sims, xlabel='number of data points (n)', ylabel='similarity (svcca)', show=True, save_plot=True, plot_filename='ndata_vs_svcca_sim', title='Data points (n) vs Sim (SVCCA)', x_hline=Dout, x_hline_label=f'B=D={Dout}')
#%%
from pathlib import Path
from matplotlib import pyplot as plt
import torch
import torch.nn as nn
import uutils
from uutils.torch_uu import cxa_sim, approx_equal
from uutils.torch_uu.models import get_named_one_layer_random_linear_model
from uutils.plot import plot, save_to_desktop
import uutils.plot as uuplot
# Part 1: a single comparison with B=10 data points and Dout=300 features
# between two separately constructed random linear models.
B: int = 10  # [101, 200, 500, 1000, 2000, 5000, 10000]
Din: int = B
Dout: int = 300
mdl1: nn.Module = get_named_one_layer_random_linear_model(Din, Dout)
mdl2: nn.Module = get_named_one_layer_random_linear_model(Din, Dout)
layer_name = 'fc0'
# cxa_dist_type = 'pwcca'
cxa_dist_type = 'svcca'
X: torch.Tensor = uutils.torch_uu.get_identity_data(B)
sim: float = cxa_sim(mdl1, mdl2, X, layer_name, downsample_size=None, iters=1, cxa_dist_type=cxa_dist_type)
print(f'Should be very very close to 1.0: {sim=}')
print(f'Is it close to 1.0? {approx_equal(sim, 1.0)}')

# Part 2: sweep the feature dimension d with the number of data points fixed
# at B=300 and record the similarity for each d.
# data_sizes: list[int] = [10, 25, 50, 100, 101, 200, 500, 1_000, 2_000, 5_000, 10_000, 50_000]
B: int = 300
D_feature_sizes: list[int] = [10, 25, 50, 100, 101, 200, 500, 1_000, 2_000, 5_000, 10_000]
sims: list[float] = []
for d in D_feature_sizes:
    # Fresh data and a fresh pair of random models for every feature size d.
    X: torch.Tensor = uutils.torch_uu.get_identity_data(B)
    mdl1: nn.Module = get_named_one_layer_random_linear_model(B, d)
    mdl2: nn.Module = get_named_one_layer_random_linear_model(B, d)
    sim: float = cxa_sim(mdl1, mdl2, X, layer_name, downsample_size=None, iters=1, cxa_dist_type=cxa_dist_type)
    # print(f'{d=}, {sim=}')
    sims.append(sim)
print(f'{sims=}')
# The horizontal-line marker sits at x = B.
uuplot.plot(x=D_feature_sizes, y=sims, xlabel='number of features/size of dimension (D)', ylabel='similarity (svcca)', show=True, save_plot=True, plot_filename='D_vs_sim_svcca', title='Features (D) vs Sim (SVCCA)', x_hline=B, x_hline_label=f'B=D={B}')
# uuplot.plot(x=D_feature_sizes, y=sims, xlabel='number of features/size of dimension (D)', ylabel='similarity (svcca)', show=True, save_plot=True, plot_filename='D_vs_sim', title='Features (D) vs Sim (SVCCA)')
should produce the plots above. If the plots reproduce we are good ;) :)