## Using LSTM cells in a recurrent neural network, this script generates chatbot profiles for each primary South Park character.

import pickle

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Load the dialogue CSV and pull out the columns used below.
df = pd.read_csv("../input/All-seasons.csv")
lines = df["Line"]
characters = df["Character"]
episodes = df["Episode"]

# Prefix every line with its speaker in parentheses: "(<character>) <line>".
charlines = "("+characters+") " + lines

# Concatenate the whole corpus in a single pass; str.join avoids the repeated
# reallocation of growing a string with += inside a loop.
text = "".join(charlines)

# Punctuation -> placeholder token. Each symbol is padded with spaces when
# substituted so that it becomes its own "word" after split().
# Entries that are commented out are not tokenized.
token_dict = {
    '!': '||EXCLAIMATIONMARK||',
    '?': '||QUESTIONMARK||',
    '--': '||DOUBLEDASH||',
    '"': '||DOUBLEQUOTE||',
    ',': '||COMMA||',
    '.': '||PERIOD||',
    ';': '||SEMICOLON||',
    '\n': '||NEWLINE||',
    '(': '||OPENPAREN||',
    ')': '||CLOSEPAREN||',
    #'+': '||PLUS||',
    '&': '||AMPERSAND||',
    ':': '||COLON||',
    #'\'': '||APOSTROPHE||',
    #'-': '||DASH||',
}

for key, token in token_dict.items():
    text = text.replace(key, ' {} '.format(token))

# NOTE: lower() runs after the substitution, so the placeholder tokens in the
# word list are lowercase (e.g. '||period||') while the values stored in
# token_dict remain uppercase.
text = text.lower().split()
vocab = set(text)

# Number the words in sorted order so the ids are reproducible across runs;
# iteration order of a set of strings changes with hash randomization.
vocab_to_int = {w: i for i, w in enumerate(sorted(vocab))}
int_to_vocab = {i: w for w, i in vocab_to_int.items()}

# The corpus encoded as a list of word ids.
int_text = [vocab_to_int[word] for word in text]

# Save preprocessed data to file

# Use context managers so the file handles are flushed and closed
# deterministically instead of being left to the garbage collector.
with open('preprocess.p', 'wb') as out_file:
    pickle.dump((int_text, vocab_to_int, int_to_vocab, token_dict), out_file)

# Read the checkpoint back. The file was just written by this script, so
# unpickling it is safe; never unpickle a file from an untrusted source.
with open('preprocess.p', mode='rb') as in_file:
    int_text, vocab_to_int, int_to_vocab, token_dict = pickle.load(in_file)