/KG-Embeddings

Primary LanguageJupyter NotebookMIT LicenseMIT

Text-Embeddings

# Python program to generate embedding (word vectors) using Word2Vec

# importing necessary modules for embedding
pip install --upgrade gensim

import nltk
nltk.download('punkt')

# importing all necessary modules
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings

warnings.filterwarnings(action = 'ignore')

import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

import numpy
import multiprocessing
import re, string # using to remove regular expression, special characters in txt files
# Read the text file to embed, e.g. the 'Gravity_DBpedia.txt' file
def input_text_file(file_path):
    """Read a text file and return its content on a single line.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file content with every newline character replaced by a space.
    """
    # The context manager closes the handle even if reading fails.
    with open(file_path, "r") as sample:
        content = sample.read()

    # Replace newline characters with spaces
    return content.replace("\n", " ")
# Define a function that removes unnecessary/special characters and returns lower-case plain text / Monir
def word_drop(text):
    """Return a lower-cased copy of *text* with noise removed.

    Steps, in order: lower-case; drop ``[...]`` spans; drop URLs
    (``http(s)://...`` and ``www....``); drop ``<...>`` tags; turn every
    non-word character into a space; drop remaining punctuation (only ``_``
    survives the previous step); drop any word that contains a digit.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string. Removed characters may leave runs of spaces.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    # URLs and tags must be removed while their punctuation (':', '/', '.',
    # '<', '>') is still present; after the \W substitution below these
    # patterns could never match.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    # Newlines are non-word characters too, so they become spaces here.
    text = re.sub(r'\W', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
# Define a function to tokenize the text into sentences of words / Monir
def word_tokenize(text):
    """Split *text* into sentences and each sentence into lower-cased tokens.

    Args:
        text: The (pre-processed) text to tokenize.

    Returns:
        A list with one entry per sentence; each entry is the list of
        lower-cased word tokens of that sentence.
    """
    # This function has the same name as nltk.tokenize.word_tokenize and
    # shadows it at module level, so the NLTK tokenizer is imported under an
    # alias here; calling the bare name would recurse into this function.
    from nltk.tokenize import sent_tokenize
    from nltk.tokenize import word_tokenize as nltk_word_tokenize

    # Tokenize the argument itself; no module-level text variable is needed.
    return [
        [token.lower() for token in nltk_word_tokenize(sentence)]
        for sentence in sent_tokenize(text)
    ]
    
# Create CBOW model
def create_embedding_model(data, vector_size=300, window=10, min_count=1):
    """Train a Word2Vec model on tokenized sentences.

    The training algorithm is left at gensim's default (CBOW, ``sg=0``).

    Args:
        data: Iterable of sentences, each a list of string tokens.
        vector_size: Dimensionality of the word vectors.
        window: Maximum distance between the current and predicted word.
        min_count: Words with a lower total frequency are ignored.

    Returns:
        The trained ``gensim.models.Word2Vec`` model.
    """
    return gensim.models.Word2Vec(
        data,
        min_count=min_count,
        vector_size=vector_size,
        window=window,
    )
def embedding_main(file_path="/content/drive/My Drive/MonirResearchDatasets/Gravity_DBpedia.txt"):
    """Run the text-embedding pipeline: read, clean, tokenize, train.

    Args:
        file_path: Text file to embed; defaults to the Gravity_DBpedia.txt
            dataset on the mounted Google Drive.

    Returns:
        The model returned by ``create_embedding_model``.
    """
    text = input_text_file(file_path)
    preprocessed_text = word_drop(text)
    text_to_token = word_tokenize(preprocessed_text)
    # Return the model so callers can use it; it was previously discarded.
    return create_embedding_model(text_to_token)

KG-Embeddings

def embedding_word_clusters(model, list_of_ga_themes, cluster_size):
    """Collect the nearest neighbours of each theme and their vectors.

    Args:
        model: Object exposing ``most_similar(word, topn=...)`` and
            item lookup ``model[word]``.
        list_of_ga_themes: Words to build one cluster each for.
        cluster_size: Number of similar words requested per theme.

    Returns:
        A tuple ``(embedding_clusters, word_clusters)``; both are lists with
        one inner list per theme, holding the vectors and the words of the
        similar terms respectively.
    """
    embedding_clusters = []
    word_clusters = []
    for theme in list_of_ga_themes:
        # most_similar yields (word, score) pairs; only the word is kept.
        neighbours = [
            neighbour
            for neighbour, _score in model.most_similar(theme, topn=cluster_size)
        ]
        word_clusters.append(neighbours)
        embedding_clusters.append([model[neighbour] for neighbour in neighbours])

    return embedding_clusters, word_clusters
  
Code here..............
# Script for creating a 2D scatter plot using Matplotlib library for data visualization in Python.
% matplotlib inline

def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    """Draw one coloured, annotated scatter cluster per label.

    Args:
        title: Title of the figure.
        labels: Legend label of each cluster; also sets the number of colours.
        embedding_clusters: One 2-D array per label; column 0 is used as x
            and column 1 as y (must support ``[:, 0]`` indexing).
        word_clusters: For each label, the words annotating its points.
        a: Alpha (opacity) of the scatter markers.
        filename: If truthy, the figure is also saved there as a PNG.
    """
    plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        # Pass the single RGBA row as ``color``, not ``c``: a 1-D sequence
        # given as ``c`` is ambiguous and is treated as values to colormap
        # when its length equals the number of points.
        plt.scatter(x, y, color=color, alpha=a, label=label)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(True)
    if filename:
        plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()
Code here............

GA-themes extraction

pip install rdflib
import rdflib
from rdflib import Graph

# Parse the GA records Turtle file into an in-memory rdflib graph.
g = Graph()
g.parse("/content/drive/My Drive/MonirResearchDatasets/surround-ga-records/ga-records.ttl", format='turtle')

# Print the size of the parsed graph (for an rdflib Graph, its triple count).
print(len(g))
import collections
from collections import Counter
# A function that extracts the ga-themes from a GA RDF record file and returns them all as a list - Monir
def gaThemesExtraction(ga_record):
    """Extract the ga-theme identifiers referenced in a record file.

    For every line that contains ``ga-themes/``, the text between the first
    ``ga-themes/`` and the next ``>`` is collected. Only the first occurrence
    on a line is used.

    Args:
        ga_record: Path of the text file to scan.

    Returns:
        A list of theme identifiers in file order, duplicates included.
    """
    marker = 'ga-themes/'
    gaThemes = []
    with open(ga_record, 'rt') as f:
        # Stream the file line by line instead of loading it whole.
        for line in f:
            _, found, tail = line.partition(marker)
            # Skip lines without the full marker; a bare 'ga-themes' with no
            # trailing slash has no identifier to extract.
            if not found:
                continue
            gaThemes.append(tail.split('>')[0])
    return gaThemes
# A function that takes a list of ga-themes and returns a list of the unique ga-themes and another list of the duplicated ga-themes
def make_unique_gaThemes(list_all_ga_themes):
    """Split a list of ga-themes into unique values and duplicated values.

    Args:
        list_all_ga_themes: List of theme identifiers, possibly with repeats.

    Returns:
        A tuple ``(unique_gaThemes, duplicate_gaThemes)``: every distinct
        theme in first-seen order, and the themes that occur more than once
        (also in first-seen order).
    """
    # Count the argument itself rather than a module-level variable, so the
    # function works for any list passed in.
    counts = collections.Counter(list_all_ga_themes)

    # A Counter keeps insertion order, so its keys are the unique themes.
    unique_gaThemes = list(counts)
    duplicate_gaThemes = [item for item, count in counts.items() if count > 1]

    return unique_gaThemes, duplicate_gaThemes
  
# to connect to Google drive with colab
from google.colab import drive
drive.mount('/content/drive')
# to get all the ga-themes from ga-records.ttl file
ga_record_datapath = "/content/drive/My Drive/MonirResearchDatasets/surround-ga-records/ga-records.ttl"
gaThemes = gaThemesExtraction(ga_record_datapath)
print(gaThemes[:10])
print(len(gaThemes))
# to get all the ga-themes from all1K file 
ga_record_datapath = "/content/drive/My Drive/MonirResearchDatasets/surround-ga-records/all1k.ttl.txt"
gaThemes = gaThemesExtraction(ga_record_datapath)
print(gaThemes[:10])
print(len(gaThemes))
# to get all unique ga-themes
unique_gaThemes, duplicate_gaThemes = make_unique_gaThemes(gaThemes)
print(unique_gaThemes[:100])
#print(duplicate_gaThemes[:100])
print(len(unique_gaThemes))

KG_embeddings_with_ga_themes_visualization_Monir.ipynb