[BUG] ask hackernews fails on faiss with `ValueError: array is not C-contiguous`
lmeyerov opened this issue · 1 comments
lmeyerov commented
Same issue? This makes me think it may be a faiss version problem.
Versions: graphistry base 2.40.7 // faiss 1.7.2
Notebook cell that reproduces the error:
from time import time
t0 = time()
################################################################
## Two Lines of codes cuts through 80% of the datasciencing
df = df.sample(3000) # set smaller if you want to test a minibatch
################################################################
# create the graphistry instance
g = graphistry.nodes(df)
# set to False if you want to reload last trained instance
process = True
print('-'*80)
if process:
# Umap will create a similarity graph from the features which we can view as a graph
g2 = g.umap(X=['title'], # the features to encode (can add/remove 'text', etc)
y=['score'], # for demonstrative purposes, we include a target -- though this one is not really conditioned on textual features in a straightforward way
model_name='msmarco-distilbert-base-v2', #'paraphrase-MiniLM-L6-v2', etc, from sbert/Huggingface, the text encoding model
min_words = 0, # when 0 forces all X=[..] as textually encoded, higher values would ascertain if a column is textual or not depending on average number of words per column
use_ngrams=False, # set to True if you want ngram features instead (does not make great plots but useful for other situations)
use_scaler_target='standard', # for regressive targets
use_scaler=None, # there are many more settings see `g.featurize?` and `g.umap?` for further options
)
g2.save_search_instance('hn.search')
print(f'Encoding {df.shape[0]} records using {str(g2._node_encoder.text_model)[:19]} took {(time()-t0)/60:.2f} minutes')
else:
# or load the search instance
g2 = g.load_search_instance('hn.search')
print(f'Loaded saved instance')
################################################################
! Failed umap speedup attempt. Continuing without memoization speedups.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[11], line 26
16 if process:
17 # Umap will create a similarity graph from the features which we can view as a graph
18 g2 = g.umap(X=['title'], # the features to encode (can add/remove 'text', etc)
19 y=['score'], # for demonstrative purposes, we include a target -- though this one is not really conditioned on textual features in a straightforward way
20 model_name='msmarco-distilbert-base-v2', #'paraphrase-MiniLM-L6-v2', etc, from sbert/Huggingface, the text encoding model
(...)
24 use_scaler=None, # there are many more settings see `g.featurize?` and `g.umap?` for further options
25 )
---> 26 g2.save_search_instance('hn.search')
27 print(f'Encoding {df.shape[0]} records using {str(g2._node_encoder.text_model)[:19]} took {(time()-t0)/60:.2f} minutes')
28 else:
29 # or load the search instance
File /opt/conda/envs/rapids/lib/python3.8/site-packages/graphistry/text_utils.py:276, in SearchToGraphMixin.save_search_instance(self, savepath)
273 def save_search_instance(self, savepath):
274 from joblib import dump # type: ignore # need to make this onnx or similar
--> 276 self.build_index()
277 search = self.search_index
278 del self.search_index # can't pickle Annoy
File /opt/conda/envs/rapids/lib/python3.8/site-packages/graphistry/text_utils.py:46, in SearchToGraphMixin.build_index(self, angular, n_trees)
44 self.assert_features_line_up_with_nodes()
45 X = self._get_feature("nodes")
---> 46 self.search_index = FaissVectorSearch(
47 X.values
48 )
File /opt/conda/envs/rapids/lib/python3.8/site-packages/graphistry/ai_utils.py:153, in FaissVectorSearch.__init__(self, M)
151 import faiss
152 self.index = faiss.IndexFlatL2(M.shape[1])
--> 153 self.index.add(M.astype('float32'))
File /opt/conda/envs/rapids/lib/python3.8/site-packages/faiss/__init__.py:215, in handle_Index.<locals>.replacement_add(self, x)
213 n, d = x.shape
214 assert d == self.d
--> 215 self.add_c(n, swig_ptr(x))
File /opt/conda/envs/rapids/lib/python3.8/site-packages/faiss/swigfaiss.py:8779, in swig_ptr(a)
8778 def swig_ptr(a):
-> 8779 return _swigfaiss.swig_ptr(a)
ValueError: array is not C-contiguous
lmeyerov commented
Updating faiss from 1.7.2 to 1.7.4 seems to help.