Why does running clean_captions.py return empty files for val.json, test.json, and news_dataset.json?
Closed this issue · 1 comment
monajalal commented
[jalal@goku GoodNews]$ cd data/
[jalal@goku data]$ ls
total 9270700
4 -rw-r--r--. 1 jalal cs-grad 71 Sep 9 11:25 .gitignore
36744 -rw-r--r--. 1 jalal cs-grad 37622426 Sep 9 11:27 article_urls.json
1008000 -rw-r--r--. 1 jalal cs-grad 1032190641 Sep 9 11:27 captioning_dataset.json
8225936 -rw-r--r--. 1 jalal cs-grad 8423357695 Sep 9 11:29 resized.tar.gz
4 drwxr-xr-x. 12 jalal cs-grad 4096 Sep 9 12:12 ..
4 -rw-r--r--. 1 jalal cs-grad 2 Sep 9 13:50 val.json
4 -rw-r--r--. 1 jalal cs-grad 2 Sep 9 13:50 test.json
4 -rw-r--r--. 1 jalal cs-grad 2 Sep 9 13:50 news_dataset.json
0 drwxr-xr-x. 2 jalal cs-grad 192 Sep 9 13:50 .
The code is:
[jalal@goku data]$ cd ../prepocess/
[jalal@goku prepocess]$ cat clean_captions.py
import json
import nltk
import spacy
import numpy as np
import tqdm
import unidecode
from bs4 import BeautifulSoup
import re
import unicodedata
from itertools import groupby


def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words


def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words


def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words


def replace_numbers(words):
    """Replace all integer occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words


def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words


# def stem_words(words):
#     """Stem words in list of tokenized words"""
#     stemmer = LancasterStemmer()
#     stems = []
#     for word in words:
#         stem = stemmer.stem(word)
#         stems.append(stem)
#     return stems
#
# def lemmatize_verbs(words):
#     """Lemmatize verbs in list of tokenized words"""
#     lemmatizer = WordNetLemmatizer()
#     lemmas = []
#     for word in words:
#         lemma = lemmatizer.lemmatize(word, pos='v')
#         lemmas.append(lemma)
#     return lemmas


def normalize(words):
    words = remove_non_ascii(words)
    # words = to_lowercase(words)
    words = remove_punctuation(words)
    # words = replace_numbers(words)
    # words = remove_stopwords(words)
    return words


def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()


def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)


def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text


def replace_contractions(text):
    """Replace contractions in string of text"""
    return contractions.fix(text)


def preprocess_sentence(sen):
    sen = sen.strip()
    # sen = re.sub(uri_re, "", sen)
    sen = sen.encode('ascii', errors='ignore')
    sen = unidecode.unidecode(sen)
    sen = denoise_text(sen)
    # sen = replace_contractions(sen)
    sen = nltk.tokenize.word_tokenize(sen)
    sen = normalize([unicode(s) for s in sen])
    # sen = normalize(unicode(sen))
    # return sen
    return sen
    # tokenized = nltk.tokenize.word_tokenize(temp)
    # final = normalize(unicode(tokenized))
    # return ''.join(final)


# def NER(sen):
#     doc = nlp(unicode(sen))
#     return [d.ent_iob_+'-'+d.ent_type_ if d.ent_iob_ != 'O' else d.text for d in doc ], [d.text for d in doc]
def NER(sen):
    doc = nlp(unicode(sen))
    # text = doc.text
    # for ent in doc.ents:
    #     text = text.replace(ent.text, ent.label_+'_')
    tokens = [d.text for d in doc]
    # [ent.merge(ent.root.tag_, ent.text, ent.label_) for ent in doc.ents]
    # return compact([d.ent_iob_+'-'+d.ent_type_ if d.ent_iob_ != 'O' else d.text for d in doc ]), tokens
    # return text, tokens
    temp = [d.ent_type_+'_' if d.ent_iob_ != 'O' else d.text for d in doc]
    return [x[0] for x in groupby(temp)], tokens


def get_split():
    rand = np.random.uniform()
    if rand > 0.95:
        split = 'test'
        # test_num += 1
    elif rand > 0.91 and rand < 0.95:
        split = 'val'
        # val_num += 1
    else:
        split = 'train'
        # train_num += 1
    return split


if __name__ == '__main__':
    np.random.seed(42)
    nlp = spacy.load('en', disable=['parser', 'tagger'])
    print('Loading spacy modules.')

    news_data = []
    counter = 0
    test_num, val_num, train_num = 0, 0, 0

    print('Loading the json.')
    with open("../data/captioning_dataset.json", "rb") as f:
        captioning_dataset = json.load(f)

    for k, anns in tqdm.tqdm(captioning_dataset.items()):
        for ix, img in anns['images'].items():
            try:
                split = get_split()
                # import ipdb; ipdb.set_trace()
                img = preprocess_sentence(img)
                template, full = NER(' '.join(img))
                if len(' '.join(template)) != 0:
                    news_data.append({'filename': k + '_' + ix + '.jpg', 'filepath': 'resized', 'cocoid': counter,
                                      'imgid': k + '_' + ix, 'sentences': [], 'sentences_full': [],
                                      # 'sentences_article':[],
                                      'split': split})
                    news_data[counter]['sentences'].append(
                        {'imgid': counter, 'raw': ' '.join(template), 'tokens': template})
                    news_data[counter]['sentences_full'].append(
                        {'imgid': counter, 'raw': ' '.join(full), 'tokens': full})
                    counter += 1
            except:
                print(img)

    split_to_ix = {i: n['split'] for i, n in enumerate(news_data)}

    # train = [news_data[k] for k, v in split_to_ix.items() if v == 'train']
    val = [news_data[k] for k, v in split_to_ix.items() if v == 'val']
    test = [news_data[k] for k, v in split_to_ix.items() if v == 'test']

    with open("../data/test.json", "w") as f:
        json.dump(test, f)
    with open("../data/val.json", "w") as f:
        json.dump(val, f)
    with open("../data/news_dataset.json", "w") as f:
        json.dump(news_data, f)
monajalal commented
This code worked for me. It includes a couple of changes, namely:
changing sen = normalize([unicode(s) for s in sen]) to sen = normalize([str(s) for s in sen]) (and likewise doc = nlp(unicode(sen)) to doc = nlp(str(sen)) in NER),
and commenting out sen = sen.encode('ascii',errors='ignore') so that unidecode.unidecode(sen) receives a str instead of bytes.
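For context, here is my reading of why the original script ends up with those 2-byte output files when run under Python 3. This is a minimal sketch, not code from the repo, and the sample caption string is made up:

# Two Python 3 failure points in the original preprocess_sentence.
sen = "Caption with a non-ASCII character: café"

# 1) str.encode() returns bytes in Python 3, so after
#    sen = sen.encode('ascii', errors='ignore')
#    everything downstream receives bytes instead of str.
encoded = sen.encode('ascii', errors='ignore')
print(type(encoded))      # <class 'bytes'>

# 2) The builtin unicode only exists in Python 2, so
#    normalize([unicode(s) for s in sen]) raises NameError in Python 3.
try:
    unicode(sen)
except NameError as err:
    print(err)            # name 'unicode' is not defined

Because the main loop wraps each caption in a bare except: that only prints the caption, this exception gets swallowed for every image, nothing is ever appended to news_data, and json.dump writes out empty lists, which would explain the 2-byte val.json, test.json, and news_dataset.json in the listing above.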
[jalal@goku prepocess]$ cat clean_captions.py
import json
import nltk
import spacy
import numpy as np
import tqdm
import unidecode
from bs4 import BeautifulSoup
import re
import unicodedata
from itertools import groupby


def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words


def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words


def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words


def replace_numbers(words):
    """Replace all integer occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words


def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words


# def stem_words(words):
#     """Stem words in list of tokenized words"""
#     stemmer = LancasterStemmer()
#     stems = []
#     for word in words:
#         stem = stemmer.stem(word)
#         stems.append(stem)
#     return stems
#
# def lemmatize_verbs(words):
#     """Lemmatize verbs in list of tokenized words"""
#     lemmatizer = WordNetLemmatizer()
#     lemmas = []
#     for word in words:
#         lemma = lemmatizer.lemmatize(word, pos='v')
#         lemmas.append(lemma)
#     return lemmas


def normalize(words):
    words = remove_non_ascii(words)
    # words = to_lowercase(words)
    words = remove_punctuation(words)
    # words = replace_numbers(words)
    # words = remove_stopwords(words)
    return words


def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()


def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)


def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text


def replace_contractions(text):
    """Replace contractions in string of text"""
    return contractions.fix(text)


def preprocess_sentence(sen):
    sen = sen.strip()
    # sen = re.sub(uri_re, "", sen)
    # sen = sen.encode('ascii', errors='ignore')
    sen = unidecode.unidecode(sen)
    sen = denoise_text(sen)
    # sen = replace_contractions(sen)
    sen = nltk.tokenize.word_tokenize(sen)
    sen = normalize([str(s) for s in sen])
    # sen = normalize(unicode(sen))
    # return sen
    return sen
    # tokenized = nltk.tokenize.word_tokenize(temp)
    # final = normalize(unicode(tokenized))
    # return ''.join(final)


# def NER(sen):
#     doc = nlp(unicode(sen))
#     return [d.ent_iob_+'-'+d.ent_type_ if d.ent_iob_ != 'O' else d.text for d in doc ], [d.text for d in doc]
def NER(sen):
    doc = nlp(str(sen))
    # text = doc.text
    # for ent in doc.ents:
    #     text = text.replace(ent.text, ent.label_+'_')
    tokens = [d.text for d in doc]
    # [ent.merge(ent.root.tag_, ent.text, ent.label_) for ent in doc.ents]
    # return compact([d.ent_iob_+'-'+d.ent_type_ if d.ent_iob_ != 'O' else d.text for d in doc ]), tokens
    # return text, tokens
    temp = [d.ent_type_+'_' if d.ent_iob_ != 'O' else d.text for d in doc]
    return [x[0] for x in groupby(temp)], tokens


def get_split():
    rand = np.random.uniform()
    if rand > 0.95:
        split = 'test'
        # test_num += 1
    elif rand > 0.91 and rand < 0.95:
        split = 'val'
        # val_num += 1
    else:
        split = 'train'
        # train_num += 1
    return split


if __name__ == '__main__':
    np.random.seed(42)
    nlp = spacy.load('en', disable=['parser', 'tagger'])
    print('Loading spacy modules.')

    news_data = []
    counter = 0
    test_num, val_num, train_num = 0, 0, 0

    print('Loading the json.')
    with open("../data/captioning_dataset.json", "rb") as f:
        captioning_dataset = json.load(f)

    for k, anns in tqdm.tqdm(captioning_dataset.items()):
        for ix, img in anns['images'].items():
            try:
                split = get_split()
                # import ipdb; ipdb.set_trace()
                img = preprocess_sentence(img)
                template, full = NER(' '.join(img))
                if len(' '.join(template)) != 0:
                    news_data.append({'filename': k + '_' + ix + '.jpg', 'filepath': 'resized', 'cocoid': counter,
                                      'imgid': k + '_' + ix, 'sentences': [], 'sentences_full': [],
                                      # 'sentences_article':[],
                                      'split': split})
                    news_data[counter]['sentences'].append(
                        {'imgid': counter, 'raw': ' '.join(template), 'tokens': template})
                    news_data[counter]['sentences_full'].append(
                        {'imgid': counter, 'raw': ' '.join(full), 'tokens': full})
                    counter += 1
            except:
                print(img)

    split_to_ix = {i: n['split'] for i, n in enumerate(news_data)}

    # train = [news_data[k] for k, v in split_to_ix.items() if v == 'train']
    val = [news_data[k] for k, v in split_to_ix.items() if v == 'val']
    test = [news_data[k] for k, v in split_to_ix.items() if v == 'test']

    with open("../data/test.json", "w") as f:
        json.dump(test, f)
    with open("../data/val.json", "w") as f:
        json.dump(val, f)
    with open("../data/news_dataset.json", "w") as f:
        json.dump(news_data, f)
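To sanity-check that the outputs are no longer empty after running the rewritten script, a quick standard-library check like this can be used (same relative paths as in the script; nothing here is from the original repo):

import json

for name in ("../data/val.json", "../data/test.json", "../data/news_dataset.json"):
    with open(name) as f:
        # each file should now contain a non-empty list of image entries
        print(name, len(json.load(f)), "entries")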