ThilinaRajapakse/BERT_binary_text_classification

Error when running converter.py

abdallah197 opened this issue · 6 comments

I get the following error when running the converter.py file:
python3 converter.py

  File "converter.py", line 84
    with open("train_features', 'wb')" as f:
                                        ^
SyntaxError: invalid syntax

my converter.py file:

'''
If you are having trouble multiprocessing inside Notebooks, give this script a shot.
'''

import torch
import pickle
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.nn import CrossEntropyLoss, MSELoss

from tqdm import tqdm, trange
import os
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule

from multiprocessing import Pool, cpu_count
from tools import *
import convert_examples_to_features

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The input data dir. Should contain the .tsv files (or other data files) for the task.
DATA_DIR = "output/bert"

# Bert pre-trained model selected in the list: bert-base-uncased, 
# bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased,
# bert-base-multilingual-cased, bert-base-chinese.
BERT_MODEL = 'bert-base-multilingual-cased'

# The name of the task to train. I'm going to name this 'yelp'.
TASK_NAME = 'etp'

# The output directory where the fine-tuned model and checkpoints will be written.
OUTPUT_DIR = f'{DATA_DIR}/{TASK_NAME}/'

# The directory where the evaluation reports will be written to.
REPORTS_DIR = f'{DATA_DIR}/reports/{TASK_NAME}_evaluation_report/'

# This is where BERT will look for pre-trained models to load parameters from.
CACHE_DIR = 'cache/'

# The maximum total input sequence length after WordPiece tokenization.
# Sequences longer than this will be truncated, and sequences shorter than this will be padded.
MAX_SEQ_LENGTH = 250

TRAIN_BATCH_SIZE = 24
EVAL_BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 1
RANDOM_SEED = 42
GRADIENT_ACCUMULATION_STEPS = 1
WARMUP_PROPORTION = 0.1
OUTPUT_MODE = 'classification'

CONFIG_NAME = "config.json"
WEIGHTS_NAME = "pytorch_model.bin"

output_mode = OUTPUT_MODE

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

processor = ColaProcessor()
train_examples = processor.get_train_examples(DATA_DIR)
train_examples_len = len(train_examples)

label_list = processor.get_labels() # [0, 1] for binary classification
num_labels = len(label_list)

label_map = {label: i for i, label in enumerate(label_list)}
train_examples_for_processing = [(example, label_map, MAX_SEQ_LENGTH, tokenizer, output_mode) for example in train_examples]

process_count = 1
if __name__ ==  '__main__':
    print(f'Preparing to convert {train_examples_len} examples..')
    print(f'Spawning {process_count} processes..')
    with Pool(process_count) as p:
        train_features = list(tqdm(p.imap(convert_examples_to_features.convert_example_to_feature, train_examples_for_processing), total=train_examples_len))

with open("train_features', 'wb') as f:
          pickle.dump(train_features, f)

The indentation in the script looks incorrect.

Try amending the with open("train_features', 'wb') as f: line so that it's inside the if statement:

if __name__ ==  '__main__':
    print(f'Preparing to convert {train_examples_len} examples..')
    print(f'Spawning {process_count} processes..')
    with Pool(process_count) as p:
        train_features = list(tqdm(p.imap(convert_examples_to_features.convert_example_to_feature, train_examples_for_processing), total=train_examples_len))

    with open("train_features', 'wb') as f:
          pickle.dump(train_features, f)

I have tried that, but it shows this error:

SyntaxError: EOL while scanning string literal

I have changed the double quotes in the with open("train_features', 'wb') as f: line, but it throws another error:

processor = ColaProcessor()
NameError: name 'ColaProcessor' is not defined

I think that's a mistake; it should be processor = BinaryClassificationProcessor().

And if you want to use multiprocessing, process_count = 1 should be:

process_count = cpu_count() - 1
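
Put together, the two changes against your posted script would look roughly like this (just a sketch; BinaryClassificationProcessor comes from the repo's tools module, which the script already pulls in via from tools import *):

from multiprocessing import Pool, cpu_count
from tools import *  # provides BinaryClassificationProcessor

# Replace the ColaProcessor line with:
processor = BinaryClassificationProcessor()

# And replace process_count = 1 with one worker per CPU core, leaving one core free:
process_count = cpu_count() - 1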

It returns the same error as when I run it in the notebook:

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "converter.py", line 80, in <module>
    train_examples_for_processing), total=train_examples_len))
  File "/home/ab/anaconda3/envs/etp/lib/python3.7/site-packages/tqdm/_tqdm.py", line 1022, in __iter__
    for obj in iterable:
  File "/home/ab/anaconda3/envs/etp/lib/python3.7/multiprocessing/pool.py", line 748, in next
    raise value
KeyError: '4'

My whole converter.py file is:

import torch
import pickle
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.nn import CrossEntropyLoss, MSELoss

from tqdm import tqdm, trange
import os
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule

from multiprocessing import Pool, cpu_count
from tools import *
import convert_examples_to_features

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The input data dir. Should contain the .tsv files (or other data files) for the task.
DATA_DIR = "output/bert"

# Bert pre-trained model selected in the list: bert-base-uncased, 
# bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased,
# bert-base-multilingual-cased, bert-base-chinese.
BERT_MODEL = 'bert-base-multilingual-cased'

# The name of the task to train. I'm going to name this 'yelp'.
TASK_NAME = 'etp'

# The output directory where the fine-tuned model and checkpoints will be written.
OUTPUT_DIR = f'{DATA_DIR}/{TASK_NAME}/'


CACHE_DIR = 'cache/'

# The maximum total input sequence length after WordPiece tokenization.
# Sequences longer than this will be truncated, and sequences shorter than this will be padded.
MAX_SEQ_LENGTH = 250

TRAIN_BATCH_SIZE = 24
EVAL_BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 1
RANDOM_SEED = 42
GRADIENT_ACCUMULATION_STEPS = 1
WARMUP_PROPORTION = 0.1
OUTPUT_MODE = 'classification'

CONFIG_NAME = "config.json"
WEIGHTS_NAME = "pytorch_model.bin"

output_mode = OUTPUT_MODE

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

processor = BinaryClassificationProcessor()
train_examples = processor.get_train_examples(DATA_DIR)
train_examples_len = len(train_examples)

label_list = processor.get_labels() # [0, 1] for binary classification
num_labels = len(label_list)

label_map = {label: i for i, label in enumerate(label_list)}
train_examples_for_processing = [(example, label_map, MAX_SEQ_LENGTH, tokenizer, output_mode) for example in train_examples]

process_count = cpu_count() - 1
if __name__ ==  '__main__':
    print(f'Preparing to convert {train_examples_len} examples..')
    print(f'Spawning {process_count} processes..')
    with Pool(process_count) as p:
        train_features = list(tqdm(p.imap(convert_examples_to_features.convert_example_to_feature,
                                          train_examples_for_processing), total=train_examples_len))

    with open(train_features, 'wb') as f:
          pickle.dump(train_features, f)

I suspect it is something related to the multiprocessing, which I could not fix earlier when I was using the notebook. How could I extract the features without using multiprocessing?

I pushed a fix to converter.py. There were a few bugs in it.

  1. Pickle wasn't imported.
  2. A double quote was used to open the file name string for train_features, but a single quote was used to close it.
  3. ColaProcessor was used when it should be BinaryClassificationProcessor.

Also, check whether you have tqdm installed in your virtual environment. Your error seems to be related to that. Can you run it now and tell me if it works?
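
For reference, the end of the fixed script should look roughly like this. This is only a sketch: it assumes the pickle file is meant to be named 'train_features', as in your original with open line, and the commented-out lines show a single-process fallback you can switch to if multiprocessing keeps giving you trouble:

import pickle  # needs to be imported at the top of converter.py

if __name__ == '__main__':
    print(f'Preparing to convert {train_examples_len} examples..')
    print(f'Spawning {process_count} processes..')
    with Pool(process_count) as p:
        train_features = list(tqdm(p.imap(convert_examples_to_features.convert_example_to_feature,
                                          train_examples_for_processing), total=train_examples_len))

    # Single-process fallback: convert the examples in a plain loop instead of a Pool.
    # train_features = [convert_examples_to_features.convert_example_to_feature(t)
    #                   for t in tqdm(train_examples_for_processing)]

    # Matched quotes around the file name, and the dump stays inside the if guard.
    with open('train_features', 'wb') as f:
        pickle.dump(train_features, f)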