Processing Longer Texts
Closed this issue · 6 comments
Hi! Love the library!
I am trying to find a way to process longer news articles/texts. I am getting TooLongTextException. I try to split it into chunks, but then I get an UnboundLocalError: cannot access local variable 'text' where it is not associated with a value.
Hi, thanks! To be able to look into this, please provide a minimal example. It sounds like, at least once, the variable `text` is never assigned a value.
Hi, thank you so much for getting back to me so quickly! Here is the part of my code that's giving the problem. It shows that some of the batches get processed, but it does not produce an output and raises the UnboundLocalError.
def process_text(texts):
# Split text into manageable chunks
max_chunk_size = 512 # Set to a suitable size for your model
text_chunks = [texts[i:i + max_chunk_size] for i in range(0, len(texts), max_chunk_size)]
merged_entities = []
for chunk in text_chunks:
# Perform NER on each chunk
ner_spans = nlp(chunk)
# Process each merged entity and calculate sentiment
for entity in merged_entities:
# print(text)
l = texts[:entity['start']]
m = texts[entity['start']:entity['end']]
r = texts[entity['end']:]
sentiment = tsc.infer_from_text(l, m, r)
entity_name = entity['word'] ```
This is my entire code:
# Function to find coreference matches
def coreference_match(entity, existing_entities):
entity_parts = set(entity.split())
for ex_entity in existing_entities:
ex_parts = set(ex_entity.split())
# Check if both sets have the same parts
if entity_parts == ex_parts:
return ex_entity
# Check if the shorter entity is a subset of the longer entity
if len(entity_parts) < len(ex_parts) and entity_parts.issubset(ex_parts):
return ex_entity
elif len(entity_parts) > len(ex_parts) and ex_parts.issubset(entity_parts):
return ex_entity
return None
# Function to process a single text entry
def process_text(texts):
# Split text into manageable chunks
max_chunk_size = 512 # Set to a suitable size for your model
text_chunks = [texts[i:i + max_chunk_size] for i in range(0, len(texts), max_chunk_size)]
merged_entities = []
for chunk in text_chunks:
# Perform NER on each chunk
ner_spans = nlp(chunk)
# Merge beginning and inside tokens for persons (PER) and organizations (ORG), and filter them
current_entity = None
for span in ner_spans:
entity_label = span['entity'][2:] # Remove 'B-' or 'I-' prefix to get the entity type
if entity_label in ['PER', 'ORG']:
if span['entity'].startswith('B-'):
if current_entity:
merged_entities.append(current_entity)
current_entity = {'entity': entity_label, 'start': span['start'], 'end': span['end'], 'word': span['word']}
elif span['entity'].startswith('I-') and current_entity and current_entity['entity'] == entity_label:
current_entity['end'] = span['end']
current_entity['word'] += span['word'][2:] if span['word'].startswith("##") else " " + span['word']
if current_entity:
merged_entities.append(current_entity)
# Extract the words of the merged entities
ents = [entity["word"] for entity in merged_entities]
print(f"Entities: {ents}")
# Dictionary to hold sentiments for each entity
entity_sentiments = defaultdict(lambda: {'type': None, 'sentiments': []})
# Process each merged entity and calculate sentiment
for entity in merged_entities:
# print(text)
l = texts[:entity['start']]
m = texts[entity['start']:entity['end']]
r = texts[entity['end']:]
sentiment = tsc.infer_from_text(l, m, r)
entity_name = entity['word']
# Check for coreference and merge if needed
match = coreference_match(entity_name, entity_sentiments)
if match:
entity_name = match
entity_sentiments[entity_name]['type'] = entity['entity']
entity_sentiments[entity_name]['sentiments'].append(sentiment[0]['class_label'])
# print(entity_sentiments[entity_name]['sentiments'])
# Combine sentiments for entities that are the same
combined_sentiments = {}
for entity, data in entity_sentiments.items():
sentiments = data['sentiments']
entity_type = data['type']
# If there is any negative sentiment, the combined sentiment is negative
if 'negative' in sentiments:
combined_sentiments[entity] = {'type': entity_type, 'sentiment': 'negative'}
elif 'positive' in sentiments:
combined_sentiments[entity] = {'type': entity_type, 'sentiment': 'positive'}
else:
combined_sentiments[entity] = {'type': entity_type, 'sentiment': 'neutral'}
# Print entities separately
print("Extracted Entities:")
for entity in entity_sentiments.keys():
print(entity)
# Print combined sentiments
print("\nEntities and their sentiments:")
for entity, data in combined_sentiments.items():
print(f"Entity: {entity}, Type: {data['type']}, Combined Sentiment: {data['sentiment']}")
# Process the sample text
process_text(TEXT)
Sorry, but that's not a minimal example to reproduce the issue — it's just your project's code. I would debug through an actual minimal example, but unfortunately I won't be able to debug your project's code.
I'm sorry. I was actually able to resolve the issue. Thank you so much again for your prompt reply and readiness to help.