├── README.md // 本文件
├── dataset // 数据集目录
│ ├── test.txt // 原测试集
│ ├── test_text.txt // 预处理后只包含文本的测试集
│ ├── train.txt // 原训练集
│ ├── train_label.txt // 预处理后只包含标签的训练集
│ └── train_text.txt // 预处理后只包含文本的训练集
├── output
│ └── 周恩帅-PB17111561-X.txt // 测试集上的若干次预测结果
├── res // 预训练词向量及实验说明等资源文件
│ ├── exp2.pdf
│ ├── glove.6B.100d.txt
│ ├── glove.6B.200d.txt
│ ├── glove.6B.300d.txt
│ ├── glove.6B.50d.txt
│ └── 实验二在线评测说明.pdf
├── src
│ ├── re-bert-lstm-conv.ipynb // albert+lstm+cnn 模型
│ ├── re-bert-lstm.ipynb // albert+lstm模型
│ ├── re-bert-lstm.py // albert+lstm模型
│ ├── re-fasttext.ipynb // fasttext+lstm模型
│ ├── re-glove.ipynb // glove+lstm模型
│ ├── stat-feature.ipynb // 统计样本的一些特征,如句子长度、关系词之间的距离
│ └── util.py // 预处理、数据集读取、写入函数
└── 实验报告.pdf
软硬件环境:
- Linux
- Python3.6 with numpy,tensorflow,tensorflow_hub,tensorflow_text
- ALBERT预训练模型
- Nvidia Tesla V100
运行方式:
- 预处理数据集: `python3 util.py`
- 训练模型,训练完成后同时输出在测试集上的预测结果: `python3 re-bert-lstm.py`
从 TF Hub 上加载预训练的 ALBERT 模型,将句子编码成向量序列:
# Load the pretrained ALBERT preprocessor and encoder from TF Hub and wire
# them into a Keras graph mapping raw sentence strings to contextual embeddings.
preprocessor = hub.load(
    "https://hub.tensorflow.google.cn/tensorflow/albert_en_preprocess/2"
)

# One scalar string input per example (a single sentence).
text_inputs = [tf.keras.layers.Input(shape=(), dtype=tf.string)]

# Tokenize each raw-text segment with the ALBERT tokenizer.
to_tokens = hub.KerasLayer(preprocessor.tokenize)
token_segments = [to_tokens(segment) for segment in text_inputs]

# Pack the token segments into the fixed-length (seq_length) input dict
# that the encoder expects.
packer = hub.KerasLayer(
    preprocessor.bert_pack_inputs, arguments=dict(seq_length=seq_length)
)
packed_inputs = packer(token_segments)

# Frozen ALBERT encoder — used purely as a feature extractor here.
albert_encoder = hub.KerasLayer(
    "https://hub.tensorflow.google.cn/tensorflow/albert_en_base/2", trainable=False
)
encoder_outputs = albert_encoder(packed_inputs)
pooled_output = encoder_outputs["pooled_output"]      # whole-sentence representation
sequence_output = encoder_outputs["sequence_output"]  # per-token representations
使用tensorflow搭建LSTM分类网络并训练
# Classification head: a bidirectional LSTM over the ALBERT token embeddings,
# then a softmax over the relation classes; trained with cross-entropy.
x = sequence_output
bilstm = layers.Bidirectional(
    layers.LSTM(lstm_nodes, recurrent_dropout=0.2, dropout=0.2)
)
features = bilstm(x)
outputs = layers.Dense(len(relations), activation="softmax")(features)

model = tf.keras.Model(inputs=text_inputs, outputs=outputs)
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
history = model.fit(
    X, Y, epochs=epochs, batch_size=32, validation_split=valid_ratio
)
在得到最好的网络之前,我们还尝试了CNN,分为卷积、池化、合并几个步骤
# CNN head over the token embeddings: parallel Conv2D branches with kernel
# heights 2..6 (n-gram detectors), each max-pooled over the time axis,
# flattened, concatenated, and fed to a softmax classifier.
# Rewritten as a loop: the original duplicated the conv/pool/reshape triple
# once per kernel size (plus dead commented-out variants for 7/8/16/32/64).
x = sequence_output
# Optional BiLSTM in front of the convolutions (kept for reference):
# x = layers.Bidirectional(layers.LSTM(256, return_sequences=True, recurrent_dropout=0.2, dropout=0.2))(x)

# Add a trailing channel dimension so Conv2D sees a single-channel "image";
# assumes the time axis has length 128 — TODO confirm against seq_length.
x = layers.Reshape((128, -1, 1))(x)

pooled_branches = []
for height in (2, 3, 4, 5, 6):  # n-gram window sizes; wider windows did not improve results
    # Full-width kernel: each filter spans the whole embedding dimension.
    conv = layers.Conv2D(1, (height, x.shape[2]), activation='relu')(x)
    # Max-pool over the entire remaining time dimension (global max per branch).
    pool = layers.MaxPooling2D(pool_size=(conv.shape[1], 1), padding='valid')(conv)
    pooled_branches.append(layers.Reshape((-1,))(pool))

x = layers.Concatenate(axis=1)(pooled_branches)
outputs = layers.Dense(len(relations), activation='softmax')(x)
model = tf.keras.Model(inputs=text_inputs, outputs=outputs)
在使用BERT之前,我们尝试了Glove等预训练词向量:
# Build an embedding matrix from pretrained GloVe vectors and use it to
# initialize a frozen Embedding layer in front of a BiLSTM classifier.
glove_dir = os.path.join("..", "res", "")
embedding_dim = 300

# Parse the GloVe text file: one "<word> <v1> ... <vD>" entry per line.
embeddings_index = {}
glove_path = os.path.join(glove_dir, 'glove.6B.%dd.txt' % embedding_dim)
with open(glove_path, encoding="utf-8") as f:  # `with` guarantees the handle is closed
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Rows follow the tokenizer's word index; words without a pretrained vector
# (or with index >= max_words) stay all-zero. The index bound is checked
# first so out-of-range words skip the dict lookup entirely.
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

model = tf.keras.models.Sequential()
model.add(layers.Embedding(max_words, embedding_dim, mask_zero=True))
model.add(layers.Bidirectional(layers.LSTM(128, go_backwards=True, recurrent_dropout=0.2, dropout=0.2)))
model.add(layers.Dense(len(relations), activation='softmax'))
# NOTE(review): set_weights requires the layer's weights to already exist;
# this Sequential model is given no input shape, so the Embedding layer may
# be unbuilt at this point — confirm the model is built (e.g. model.build or
# a forward pass) before this call.
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False  # keep the pretrained vectors fixed during training