graphistry/pygraphistry

[BUG] ask hackernews fails on faiss

lmeyerov opened this issue · 1 comment

Same issue? This makes me think it may be the faiss version.


graphistry base 2.40.7 // faiss 1.7.2

cell

from time import time
t0 = time()
################################################################
## Two lines of code cut through 80% of the data-science work

df = df.sample(3000)  # set smaller if you want to test on a minibatch

################################################################
# create the graphistry instance with `df` as its nodes
g = graphistry.nodes(df)

# set to False to reload the last saved instance ('hn.search') instead of re-encoding
process = True

print('-'*80)
if process:
    # Umap will create a similarity graph from the features which we can view as a graph
    g2 = g.umap(X=['title'], # the features to encode (can add/remove 'text', etc)
                y=['score'], # for demonstrative purposes, we include a target -- though this one is not really conditioned on textual features in a straightforward way
                model_name='msmarco-distilbert-base-v2', #'paraphrase-MiniLM-L6-v2', etc, from sbert/Huggingface, the text encoding model
                min_words = 0, # 0 forces every X=[..] column to be text-encoded; higher values decide whether a column is textual from its average number of words
                use_ngrams=False, # set to True for ngram features instead (plots less well, but useful in other situations)
                use_scaler_target='standard', # for regression targets
                use_scaler=None, # there are many more settings see `g.featurize?` and `g.umap?` for further options
               )
    g2.save_search_instance('hn.search')
    print(f'Encoding {df.shape[0]} records using {str(g2._node_encoder.text_model)[:19]} took {(time()-t0)/60:.2f} minutes')
else:
    # or load the search instance
    g2 = g.load_search_instance('hn.search')
    print(f'Loaded saved instance')
    
################################################################
! Failed umap speedup attempt. Continuing without memoization speedups.
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[11], line 26
     16 if process:
     17     # Umap will create a similarity graph from the features which we can view as a graph
     18     g2 = g.umap(X=['title'], # the features to encode (can add/remove 'text', etc)
     19                 y=['score'], # for demonstrative purposes, we include a target -- though this one is not really conditioned on textual features in a straightforward way
     20                 model_name='msmarco-distilbert-base-v2', #'paraphrase-MiniLM-L6-v2', etc, from sbert/Huggingface, the text encoding model
   (...)
     24                 use_scaler=None, # there are many more settings see `g.featurize?` and `g.umap?` for further options
     25                )
---> 26     g2.save_search_instance('hn.search')
     27     print(f'Encoding {df.shape[0]} records using {str(g2._node_encoder.text_model)[:19]} took {(time()-t0)/60:.2f} minutes')
     28 else:
     29     # or load the search instance

File /opt/conda/envs/rapids/lib/python3.8/site-packages/graphistry/text_utils.py:276, in SearchToGraphMixin.save_search_instance(self, savepath)
    273 def save_search_instance(self, savepath):
    274     from joblib import dump  # type: ignore   # need to make this onnx or similar
--> 276     self.build_index()
    277     search = self.search_index
    278     del self.search_index  # can't pickle Annoy

File /opt/conda/envs/rapids/lib/python3.8/site-packages/graphistry/text_utils.py:46, in SearchToGraphMixin.build_index(self, angular, n_trees)
     44 self.assert_features_line_up_with_nodes()
     45 X = self._get_feature("nodes")
---> 46 self.search_index = FaissVectorSearch(
     47     X.values
     48 )

File /opt/conda/envs/rapids/lib/python3.8/site-packages/graphistry/ai_utils.py:153, in FaissVectorSearch.__init__(self, M)
    151 import faiss
    152 self.index = faiss.IndexFlatL2(M.shape[1])
--> 153 self.index.add(M.astype('float32'))

File /opt/conda/envs/rapids/lib/python3.8/site-packages/faiss/__init__.py:215, in handle_Index.<locals>.replacement_add(self, x)
    213 n, d = x.shape
    214 assert d == self.d
--> 215 self.add_c(n, swig_ptr(x))

File /opt/conda/envs/rapids/lib/python3.8/site-packages/faiss/swigfaiss.py:8779, in swig_ptr(a)
   8778 def swig_ptr(a):
-> 8779     return _swigfaiss.swig_ptr(a)

ValueError: array is not C-contiguous

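Updating faiss to 1.7.4 seems to help. (As the traceback shows, the faiss 1.7.2 `add` wrapper hands the array straight to `swig_ptr`, which raises when the input is not C-contiguous.)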