Golang and Python read the same bin file but get different vectors
iris-qq opened this issue · 2 comments
iris-qq commented
When I use python3 gensim.models.KeyedVectors.load_word2vec_format
and golang word2vec.FromReader
to load the same bin file (trained with python3), I get different vectors for the same words. Is something wrong in my code? Please help.
python3
# Load a word2vec binary model with gensim and print the vectors of a few
# sample words, for comparison with the Go loader's output.
import os

import gensim

# The model file is looked up under ./static next to this script.
basePath = os.path.dirname(os.path.abspath(__file__))
filePath = os.path.join(
    basePath, "static", "news_12g_baidubaike_20g_novel_90g_embedding_64.bin")

# binary=True: the file is in the binary word2vec format, not the text format.
model = gensim.models.KeyedVectors.load_word2vec_format(
    filePath, binary=True)

mylist = ['公司', '杩', '过去', '可能', '像', '竟然', '事', '们', '为了', '最后']
for item in mylist:
    # Per this thread, the vectors loaded here are not normalised.
    print(item, model[item])
python3 result
事 [ 5.20741463e-01 -6.25130892e+00 6.28790331e+00 5.21187305e-01
-4.82128191e+00 -2.63628817e+00 -8.96926224e-01 -1.69552505e-01
-3.48719239e+00 1.01413813e+01 8.82065475e-01 -2.30002832e+00
-2.53639960e+00 2.25287819e+00 6.64753675e+00 -3.75168061e+00
5.61822701e+00 -1.79358745e+00 -1.42449875e+01 -3.06644630e+00
-1.98697805e-01 6.02796507e+00 5.95407152e+00 9.19211960e+00
-7.91416168e+00 1.67050323e+01 3.24403763e+00 8.59381866e+00
6.22694492e+00 2.71203089e+00 2.42259574e+00 4.90009785e+00
1.08043420e+00 1.53470738e-02 -6.89982033e+00 -3.86542034e+00
-3.74263287e+00 8.82859039e+00 -4.24431896e+00 3.47348619e+00
2.21167183e+00 5.88018775e-01 -2.14199276e+01 9.24456656e-01
-2.98334181e-01 -7.56087303e+00 1.36260653e+01 -5.46317768e+00
4.61971760e+00 -1.25563478e+01 -1.04119625e+01 9.43718529e+00
7.70377064e+00 3.34251237e+00 5.42984724e+00 4.13672972e+00
-2.69478232e-01 -3.73712564e+00 -2.59542608e+00 3.42773581e+00
4.04233158e-01 1.37277615e+00 -3.78718829e+00 4.31480455e+00]
们 [-11.414904 8.840026 3.3249636 -3.6919358 -0.7717347
-3.8709686 -1.8805499 -0.7131454 3.6418455 3.3767622
3.3609931 -1.1868769 1.0058872 1.8745646 -8.834773
-5.1170874 8.119795 -5.340779 3.138594 3.4976473
1.7618842 9.266873 -0.9348149 -10.322832 -9.4384165
-8.672556 6.198186 5.2617297 -1.897403 6.845674
-6.630889 -6.245321 -7.5150595 9.888341 4.179519
6.4567084 -0.62740654 -0.543522 -11.651043 -1.536419
2.7479737 0.5292694 -1.3375239 0.18971145 -0.4524008
-3.3986962 2.8511124 1.9053295 8.0099945 3.8884878
-1.4678761 -5.1613317 -6.130191 -6.2907724 -0.86378443
2.1271715 1.6532385 0.9676462 -1.129918 -2.5658238
2.2253346 4.9628057 -5.314143 6.8483343 ]
为了 [-0.8325726 -4.4836063 8.727473 -1.343671 0.5271192 -0.25381362
-4.712411 -2.155193 2.8902082 6.753922 6.1289186 -0.7303832
16.596771 4.0743265 -2.5742066 3.1285803 -4.4062934 -4.2063847
-5.0557876 -3.7950578 -1.7724227 18.056515 5.4479613 -2.6919541
-8.849724 0.15571393 0.8785213 6.7841344 -1.057811 4.2452216
2.0949492 1.3838209 -8.502722 -5.2050786 -4.5449476 -5.742174
-1.3523334 5.2770963 2.5819428 8.246134 3.042666 -6.013532
2.112649 5.639371 -1.5676526 4.670214 2.030399 -6.7387877
-5.9026203 6.5401993 0.04721347 6.843486 5.7300572 -5.127986
2.8689466 6.3467226 2.6572473 3.1401997 -0.32292706 -3.2116556
-0.8051431 -3.0449674 -4.453163 1.566594 ]
最后 [-4.706695 -6.6112156 6.77635 -0.07057857 -3.4353297 -4.752667
-2.5630064 0.38487384 -5.909677 -5.2324333 0.28105256 2.6107652
-0.14168012 -2.2385676 -2.7318947 4.998936 -2.883193 0.87121415
-6.307436 -2.8092208 -1.2877645 10.510157 -2.96826 -3.5343978
0.32090768 -2.2008362 -0.8916678 6.8451 -3.7206984 -8.687311
-4.926895 -0.823666 0.10354326 -6.118841 0.07968843 6.116212
-1.8622543 0.24350323 -0.12790576 3.8704085 -1.2907848 -3.8995178
2.462558 1.9539595 -0.5990311 8.661512 -2.0175807 7.1245584
-3.8675046 5.37204 -6.6278043 0.1988717 2.2416093 -3.1289532
0.48402345 4.440447 -2.786379 1.0980356 -0.6593599 2.2009523
3.730244 -9.887996 4.0060644 5.718888 ]
golang
// main loads a word2vec binary model and prints the vectors of a few sample
// words.
//
// As reported in this thread, word2vec.FromReader normalises each vector while
// loading (v.Normalise()), so the printed values are scaled versions of the
// vectors stored in the file rather than the stored values themselves.
func main() {
	basePath, err := utils.InitBasePate()
	if err != nil {
		log.Fatalf("get base path error: %v", err)
	}

	fileAbsPath := filepath.Join(basePath, word2BinPackageName)
	word2vecReader, err := os.Open(fileAbsPath)
	if err != nil {
		log.Fatalf("open bing package error: %v", err)
	}
	// Release the file handle on normal return. log.Fatalf exits the process
	// directly, in which case the OS reclaims the handle.
	defer word2vecReader.Close()

	model, err := word2vec.FromReader(word2vecReader)
	if err != nil {
		log.Fatalf("read word vector error: %v", err)
	}

	var words = []string{"公司", "杩", "过去", "可能", "像", "竟然", "事", "们", "为了", "最后"}
	vectors := model.Map(words)
	for _, item := range words {
		fmt.Println(item, vectors[item])
	}
}
golang result
事 [0.010038253 -0.12050552 0.12121095 0.010046847 -0.09293911 -0.050819322 -0.017289909 -0.0032684375 -0.06722207 0.19549385 0.017003441 -0.04433729 -0.048893783 0.04342837 0.12814355 -0.07232057 0.1083017 -0.03457471 -0.27459842 -0.059111413 -0.0038302671 0.11620016 0.114775725 0.17719507 -0.15256007 0.32202035 0.06253482 0.16566172 0.12003586 0.0522794 0.046700004 0.09445843 0.020827364 0.00029584317 -0.13300678 -0.074513115 -0.072146155 0.17018738 -0.08181708 0.06695786 0.04263406 0.011335148 -0.41290864 0.017820608 -0.005750942 -0.14574979 0.26266757 -0.10531284 0.089053586 -0.24204679 -0.2007098 0.18191917 0.1485044 0.0644331 0.10467033 0.0797431 -0.0051946905 -0.07204 -0.05003163 0.066075936 0.0077923406 0.026462795 -0.07300504 0.08317582]
们 [-0.27149034 0.21024983 0.079080425 -0.08780844 -0.01835482 -0.09206653 -0.04472671 -0.016961342 0.0866171 0.08031239 0.07993735 -0.028228499 0.023923866 0.04458436 -0.201249 -0.12170403 0.19311996 -0.12702426 0.07464784 0.08318751 0.041904386 0.2204019 -0.022233494 -0.24551667 -0.22448187 -0.20626675 0.14741671 0.12514418 -0.045127545 0.16281646 -0.15770805 -0.14853776 -0.17873703 0.23518279 0.09940505 0.15356538 -0.014922141 -0.012927044 -0.27710664 -0.036541957 0.06535739 0.012588062 -0.031811465 0.0045120683 -0.010759831 -0.080834076 0.06781042 0.045316067 0.19050848 0.0924832 -0.03491174 -0.122756325 -0.14579953 -0.14961877 -0.020544117 0.050592322 0.039320372 0.02301435 -0.026873797 -0.06102516 0.05292702 0.118034616 -0.12639077 0.16287974]
为了 [-0.019490412 -0.10496061 0.20430896 -0.031455155 0.012339789 -0.0059417426 -0.110316895 -0.050452776 0.06765938 0.1581084 0.14347716 -0.017098172 0.38852817 0.09537943 -0.060261827 0.07323964 -0.103150725 -0.09847089 -0.1183553 -0.08884179 -0.041492175 0.42270055 0.12753603 -0.06301828 -0.20717083 0.003645242 0.020566065 0.15881567 -0.024763212 0.09938006 0.04904248 0.032395057 -0.19904757 -0.121850185 -0.10639661 -0.13442351 -0.03165794 0.1235361 0.06044293 0.19304088 0.07122839 -0.14077596 0.049456824 0.13201691 -0.036698535 0.10932908 0.047531366 -0.15775411 -0.13817954 0.15310517 0.0011052609 0.16020507 0.13413987 -0.12004545 0.06716165 0.14857592 0.0622058 0.073511645 -0.0075596785 -0.07518442 -0.01884829 -0.071282275 -0.10424794 0.03667375]
最后 [-0.13794369 -0.19376132 0.19860107 -0.0020685147 -0.10068254 -0.13929102 -0.075116515 0.011279871 -0.17320064 -0.153352 0.008237081 0.07651623 -0.0041523566 -0.0656076 -0.08006629 0.14650868 -0.08450054 0.02553352 -0.18485814 -0.08233256 -0.037741765 0.30803135 -0.08699368 -0.103586026 0.009405153 -0.064502046 -0.026132973 0.20061599 -0.10904612 -0.25460747 -0.1443973 -0.024139978 0.003034643 -0.17933081 0.0023355063 0.17925376 -0.054578893 0.0071365857 -0.003748658 0.11343382 -0.037830286 -0.11428695 0.07217258 0.057266586 -0.017556386 0.25385135 -0.059131198 0.20880635 -0.113348715 0.15744358 -0.1942475 0.005828526 0.06569702 -0.091703266 0.014185745 0.13014048 -0.08166312 0.032181196 -0.0193245 0.06450544 0.10932588 -0.2897971 0.11740962 0.16760899]
iris-qq commented
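This problem has been solved. I found that FromReader calls v.Normalise() on each vector,
but the vectors loaded in Python are not normalised. That is why every Go vector is the Python vector scaled by a constant factor (for 事, each component is divided by about 51.9).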
dhowden commented
I think it's probably worth adding a comment to the method to explain that we normalise when we load models, to avoid this confusion in future.