sajari/word2vec

Golang and Python read the same bin file but get different vectors

iris-qq opened this issue · 2 comments

When I use Python 3's gensim.models.KeyedVectors.load_word2vec_format and Go's word2vec.FromReader to load the same bin file (trained with Python 3), I get different vectors. Is there anything wrong in my code? Please help.
python3

import os
import gensim

# Resolve the embedding file relative to the directory holding this script.
script_dir = os.path.dirname(os.path.abspath(__file__))
embedding_path = os.path.join(
    script_dir,
    "static",
    "news_12g_baidubaike_20g_novel_90g_embedding_64.bin",
)

# Load the binary word2vec file through gensim.
# NOTE(review): per the discussion around this script, vectors loaded this way
# are not normalised, unlike the Go loader they are being compared against.
load_vectors = gensim.models.KeyedVectors.load_word2vec_format
model = load_vectors(embedding_path, binary=True)

# Print every probe word followed by its vector.
for word in ('公司', '杩', '过去', '可能', '像', '竟然', '事', '们', '为了', '最后'):
    print(word, model[word])

python3 result

事 [ 5.20741463e-01 -6.25130892e+00  6.28790331e+00  5.21187305e-01
 -4.82128191e+00 -2.63628817e+00 -8.96926224e-01 -1.69552505e-01
 -3.48719239e+00  1.01413813e+01  8.82065475e-01 -2.30002832e+00
 -2.53639960e+00  2.25287819e+00  6.64753675e+00 -3.75168061e+00
  5.61822701e+00 -1.79358745e+00 -1.42449875e+01 -3.06644630e+00
 -1.98697805e-01  6.02796507e+00  5.95407152e+00  9.19211960e+00
 -7.91416168e+00  1.67050323e+01  3.24403763e+00  8.59381866e+00
  6.22694492e+00  2.71203089e+00  2.42259574e+00  4.90009785e+00
  1.08043420e+00  1.53470738e-02 -6.89982033e+00 -3.86542034e+00
 -3.74263287e+00  8.82859039e+00 -4.24431896e+00  3.47348619e+00
  2.21167183e+00  5.88018775e-01 -2.14199276e+01  9.24456656e-01
 -2.98334181e-01 -7.56087303e+00  1.36260653e+01 -5.46317768e+00
  4.61971760e+00 -1.25563478e+01 -1.04119625e+01  9.43718529e+00
  7.70377064e+00  3.34251237e+00  5.42984724e+00  4.13672972e+00
 -2.69478232e-01 -3.73712564e+00 -2.59542608e+00  3.42773581e+00
  4.04233158e-01  1.37277615e+00 -3.78718829e+00  4.31480455e+00]
们 [-11.414904     8.840026     3.3249636   -3.6919358   -0.7717347
  -3.8709686   -1.8805499   -0.7131454    3.6418455    3.3767622
   3.3609931   -1.1868769    1.0058872    1.8745646   -8.834773
  -5.1170874    8.119795    -5.340779     3.138594     3.4976473
   1.7618842    9.266873    -0.9348149  -10.322832    -9.4384165
  -8.672556     6.198186     5.2617297   -1.897403     6.845674
  -6.630889    -6.245321    -7.5150595    9.888341     4.179519
   6.4567084   -0.62740654  -0.543522   -11.651043    -1.536419
   2.7479737    0.5292694   -1.3375239    0.18971145  -0.4524008
  -3.3986962    2.8511124    1.9053295    8.0099945    3.8884878
  -1.4678761   -5.1613317   -6.130191    -6.2907724   -0.86378443
   2.1271715    1.6532385    0.9676462   -1.129918    -2.5658238
   2.2253346    4.9628057   -5.314143     6.8483343 ]
为了 [-0.8325726  -4.4836063   8.727473   -1.343671    0.5271192  -0.25381362
 -4.712411   -2.155193    2.8902082   6.753922    6.1289186  -0.7303832
 16.596771    4.0743265  -2.5742066   3.1285803  -4.4062934  -4.2063847
 -5.0557876  -3.7950578  -1.7724227  18.056515    5.4479613  -2.6919541
 -8.849724    0.15571393  0.8785213   6.7841344  -1.057811    4.2452216
  2.0949492   1.3838209  -8.502722   -5.2050786  -4.5449476  -5.742174
 -1.3523334   5.2770963   2.5819428   8.246134    3.042666   -6.013532
  2.112649    5.639371   -1.5676526   4.670214    2.030399   -6.7387877
 -5.9026203   6.5401993   0.04721347  6.843486    5.7300572  -5.127986
  2.8689466   6.3467226   2.6572473   3.1401997  -0.32292706 -3.2116556
 -0.8051431  -3.0449674  -4.453163    1.566594  ]
最后 [-4.706695   -6.6112156   6.77635    -0.07057857 -3.4353297  -4.752667
 -2.5630064   0.38487384 -5.909677   -5.2324333   0.28105256  2.6107652
 -0.14168012 -2.2385676  -2.7318947   4.998936   -2.883193    0.87121415
 -6.307436   -2.8092208  -1.2877645  10.510157   -2.96826    -3.5343978
  0.32090768 -2.2008362  -0.8916678   6.8451     -3.7206984  -8.687311
 -4.926895   -0.823666    0.10354326 -6.118841    0.07968843  6.116212
 -1.8622543   0.24350323 -0.12790576  3.8704085  -1.2907848  -3.8995178
  2.462558    1.9539595  -0.5990311   8.661512   -2.0175807   7.1245584
 -3.8675046   5.37204    -6.6278043   0.1988717   2.2416093  -3.1289532
  0.48402345  4.440447   -2.786379    1.0980356  -0.6593599   2.2009523
  3.730244   -9.887996    4.0060644   5.718888  ]

golang

// main prints the vectors of a fixed set of probe words and exits with a
// non-zero status, reporting the underlying error, if any step fails.
func main() {
	if err := printVectors(); err != nil {
		log.Fatal(err)
	}
}

// printVectors loads the word2vec binary model located under the base path
// and prints each probe word followed by its vector.
//
// The printed values are not expected to match the raw values gensim prints
// for the same file: as established in the discussion around this snippet,
// word2vec.FromReader normalises vectors while loading (v.Normalise()),
// whereas gensim's load_word2vec_format returns them without normalising.
//
// It returns an error, wrapped with the step that failed, when the base path
// cannot be determined, the file cannot be opened, or the model cannot be read.
func printVectors() error {
	basePath, err := utils.InitBasePate()
	if err != nil {
		return fmt.Errorf("get base path: %w", err)
	}

	fileAbsPath := filepath.Join(basePath, word2BinPackageName)
	word2vecReader, err := os.Open(fileAbsPath)
	if err != nil {
		return fmt.Errorf("open bin package: %w", err)
	}
	// Keep the file open for the whole function and release it on every
	// return path; the handle is read-only, so the Close error is not checked.
	defer word2vecReader.Close()

	model, err := word2vec.FromReader(word2vecReader)
	if err != nil {
		return fmt.Errorf("read word vectors: %w", err)
	}

	words := []string{"公司", "杩", "过去", "可能", "像", "竟然", "事", "们", "为了", "最后"}
	vectors := model.Map(words)
	for _, item := range words {
		fmt.Println(item, vectors[item])
	}
	return nil
}

golang result

事 [0.010038253 -0.12050552 0.12121095 0.010046847 -0.09293911 -0.050819322 -0.017289909 -0.0032684375 -0.06722207 0.19549385 0.017003441 -0.04433729 -0.048893783 0.04342837 0.12814355 -0.07232057 0.1083017 -0.03457471 -0.27459842 -0.059111413 -0.0038302671 0.11620016 0.114775725 0.17719507 -0.15256007 0.32202035 0.06253482 0.16566172 0.12003586 0.0522794 0.046700004 0.09445843 0.020827364 0.00029584317 -0.13300678 -0.074513115 -0.072146155 0.17018738 -0.08181708 0.06695786 0.04263406 0.011335148 -0.41290864 0.017820608 -0.005750942 -0.14574979 0.26266757 -0.10531284 0.089053586 -0.24204679 -0.2007098 0.18191917 0.1485044 0.0644331 0.10467033 0.0797431 -0.0051946905 -0.07204 -0.05003163 0.066075936 0.0077923406 0.026462795 -0.07300504 0.08317582]
们 [-0.27149034 0.21024983 0.079080425 -0.08780844 -0.01835482 -0.09206653 -0.04472671 -0.016961342 0.0866171 0.08031239 0.07993735 -0.028228499 0.023923866 0.04458436 -0.201249 -0.12170403 0.19311996 -0.12702426 0.07464784 0.08318751 0.041904386 0.2204019 -0.022233494 -0.24551667 -0.22448187 -0.20626675 0.14741671 0.12514418 -0.045127545 0.16281646 -0.15770805 -0.14853776 -0.17873703 0.23518279 0.09940505 0.15356538 -0.014922141 -0.012927044 -0.27710664 -0.036541957 0.06535739 0.012588062 -0.031811465 0.0045120683 -0.010759831 -0.080834076 0.06781042 0.045316067 0.19050848 0.0924832 -0.03491174 -0.122756325 -0.14579953 -0.14961877 -0.020544117 0.050592322 0.039320372 0.02301435 -0.026873797 -0.06102516 0.05292702 0.118034616 -0.12639077 0.16287974]
为了 [-0.019490412 -0.10496061 0.20430896 -0.031455155 0.012339789 -0.0059417426 -0.110316895 -0.050452776 0.06765938 0.1581084 0.14347716 -0.017098172 0.38852817 0.09537943 -0.060261827 0.07323964 -0.103150725 -0.09847089 -0.1183553 -0.08884179 -0.041492175 0.42270055 0.12753603 -0.06301828 -0.20717083 0.003645242 0.020566065 0.15881567 -0.024763212 0.09938006 0.04904248 0.032395057 -0.19904757 -0.121850185 -0.10639661 -0.13442351 -0.03165794 0.1235361 0.06044293 0.19304088 0.07122839 -0.14077596 0.049456824 0.13201691 -0.036698535 0.10932908 0.047531366 -0.15775411 -0.13817954 0.15310517 0.0011052609 0.16020507 0.13413987 -0.12004545 0.06716165 0.14857592 0.0622058 0.073511645 -0.0075596785 -0.07518442 -0.01884829 -0.071282275 -0.10424794 0.03667375]
最后 [-0.13794369 -0.19376132 0.19860107 -0.0020685147 -0.10068254 -0.13929102 -0.075116515 0.011279871 -0.17320064 -0.153352 0.008237081 0.07651623 -0.0041523566 -0.0656076 -0.08006629 0.14650868 -0.08450054 0.02553352 -0.18485814 -0.08233256 -0.037741765 0.30803135 -0.08699368 -0.103586026 0.009405153 -0.064502046 -0.026132973 0.20061599 -0.10904612 -0.25460747 -0.1443973 -0.024139978 0.003034643 -0.17933081 0.0023355063 0.17925376 -0.054578893 0.0071365857 -0.003748658 0.11343382 -0.037830286 -0.11428695 0.07217258 0.057266586 -0.017556386 0.25385135 -0.059131198 0.20880635 -0.113348715 0.15744358 -0.1942475 0.005828526 0.06569702 -0.091703266 0.014185745 0.13014048 -0.08166312 0.032181196 -0.0193245 0.06450544 0.10932588 -0.2897971 0.11740962 0.16760899]

This problem is solved: I found that FromReader calls v.Normalise(), but the vectors loaded in Python are not normalised.

I think it's probably worth adding a comment to the method to explain that we normalise when we load models, to avoid this confusion in future.