seq2seq_attention
config.py
- 设置训练数据所在路径
- 设置结果存放路径
- 数据处理后训练,测试集路径
- 模型存储位置
pwd_path = os.path.abspath(os.path.dirname(__file__))

# Raw training data: Chinese CGED HSK corpora.
raw_train_paths = [
    os.path.join(pwd_path, '../data/cn/CGED/' + fname)
    for fname in (
        'CGED18_HSK_TrainingSet.xml',
        'CGED17_HSK_TrainingSet.xml',
        'CGED16_HSK_TrainingSet.xml',
        # 'sample_HSK_TrainingSet.xml',  # small sample set, disabled
    )
]

# All generated artifacts live under this directory.
output_dir = os.path.join(pwd_path, 'output')
# Processed training split.
train_path = os.path.join(output_dir, 'train.txt')
# Processed validation split.
test_path = os.path.join(output_dir, 'test.txt')
# Output file for correction results.
result_path = os.path.join(output_dir, "result.txt")
# seq2seq_attn_train artifacts: vocabulary and model weights.
save_vocab_path = os.path.join(output_dir, 'vocab.txt')
attn_model_path = os.path.join(output_dir, 'attn_model.weight')
- 训练相关参数
# Training hyper-parameters.
use_gpu = False  # switch to the CuDNN GRU kernels when True
batch_size = 64  # samples per gradient update
epochs = 40  # full passes over the training data
maxlen = 400  # maximum sequence length in characters
rnn_hidden_dim = 128  # GRU hidden-state size
dropout = 0.0  # GRU dropout rate (CPU path only)
corpus_reader
- 读取并且处理原始数据,存进train.txt,test.txt
- train.py中调用此模块
train.py
class Seq2seqAttnModel(object):
    """Seq2seq attention model for Chinese text error correction.

    Encoder: two bidirectional GRU layers. Decoder: two unidirectional GRU
    layers whose states attend over the encoder output (``Interact``), mixed
    with a learned prior over output characters (target characters are very
    likely to appear in the input).
    """

    def __init__(self, chars, hidden_dim=128, attn_model_path=None, use_gpu=False, dropout=0.2):
        """
        :param chars: vocabulary (sequence of characters); ``len(chars)``
            fixes the embedding / one-hot size.
        :param hidden_dim: RNN hidden-state size.
        :param attn_model_path: optional path to pre-trained weights; loaded
            in :meth:`build_model` when the file exists.
        :param use_gpu: when True, use CuDNNGRU kernels (no dropout support).
        :param dropout: dropout rate for the CPU GRU layers.
        """
        self.chars = chars
        self.hidden_dim = hidden_dim
        self.model_path = attn_model_path
        self.use_gpu = use_gpu
        self.dropout = dropout

    def build_model(self):
        """Build and compile the seq2seq attention Keras model.

        :return: compiled Keras ``Model`` taking ``[x_in, y_in]`` and emitting
            a softmax distribution over ``self.chars`` at each decoder step.
        """
        x_in = Input(shape=(None,))
        y_in = Input(shape=(None,))
        x = x_in
        y = y_in
        # Padding masks (token id 0 is padding) so a whole batch can be fed
        # at once; the loss below ignores padded positions.
        x_mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(x)
        y_mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(y)
        x_one_hot = Lambda(self._one_hot)([x, x_mask])
        # Learned prior over the output distribution: target characters are
        # very likely to have appeared in the input.
        x_prior = ScaleShift()(x_one_hot)
        # Shared embedding for encoder and decoder (input_dim, output_dim).
        embedding = Embedding(len(self.chars), self.hidden_dim)
        x = embedding(x)
        y = embedding(y)
        # Encoder: 2-layer bidirectional GRU; decoder: 2-layer unidirectional GRU.
        if self.use_gpu:
            # encoder
            x = Bidirectional(CuDNNGRU(int(self.hidden_dim / 2), return_sequences=True))(x)
            x = Bidirectional(CuDNNGRU(int(self.hidden_dim / 2), return_sequences=True))(x)
            # decoder
            y = CuDNNGRU(self.hidden_dim, return_sequences=True)(y)
            y = CuDNNGRU(self.hidden_dim, return_sequences=True)(y)
        else:
            # encoder
            x = Bidirectional(GRU(int(self.hidden_dim / 2), return_sequences=True, dropout=self.dropout))(x)
            x = Bidirectional(GRU(int(self.hidden_dim / 2), return_sequences=True, dropout=self.dropout))(x)
            # decoder
            y = GRU(self.hidden_dim, return_sequences=True, dropout=self.dropout)(y)
            y = GRU(self.hidden_dim, return_sequences=True, dropout=self.dropout)(y)
        # Attention interaction between decoder states and encoder output.
        xy = Interact()([y, x, x_mask])
        xy = Dense(512, activation='relu')(xy)
        xy = Dense(len(self.chars))(xy)
        xy = Lambda(lambda x: (x[0] + x[1]) / 2)([xy, x_prior])  # average with the prior
        xy = Activation('softmax')(xy)
        # Cross-entropy loss with padded positions masked out; the targets
        # are the decoder inputs shifted one step left.
        cross_entropy = K.sparse_categorical_crossentropy(y_in[:, 1:], xy[:, :-1])
        loss = K.sum(cross_entropy * y_mask[:, 1:, 0]) / K.sum(y_mask[:, 1:, 0])
        model = Model([x_in, y_in], xy)
        model.add_loss(loss)
        model.compile(optimizer=Adam(1e-3))
        # Fix: guard against attn_model_path=None (the constructor default),
        # which would make os.path.exists() raise a TypeError.
        if self.model_path and os.path.exists(self.model_path):
            model.load_weights(self.model_path)
        return model

    def _one_hot(self, x):
        """One-hot encode input ids and collapse over the time axis.

        :param x: pair ``(ids tensor, mask tensor)``.
        :return: float tensor marking which vocabulary entries occur in the
            (unpadded part of the) input sequence.
        """
        x, x_mask = x
        x = K.cast(x, 'int32')
        x = K.one_hot(x, len(self.chars))
        x = K.sum(x_mask * x, 1, keepdims=True)
        x = K.cast(K.greater(x, 0.5), 'float32')
        return x
infer.py
出现问题
运行train.py时
Using TensorFlow backend.
Read data, path:/Users/stone/PycharmProjects/pycorrector/pycorrector/seq2seq_attention/output/train.txt
Read data, path:/Users/stone/PycharmProjects/pycorrector/pycorrector/seq2seq_attention/output/test.txt
WARNING:tensorflow:From /Users/stone/anaconda3/envs/tensorflow_36/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
WARNING:tensorflow:From /Users/stone/anaconda3/envs/tensorflow_36/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
WARNING:tensorflow:From /Users/stone/anaconda3/envs/tensorflow_36/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:102: div (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/40
2019-11-01 21:45:20.999811: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2019-11-01 21:45:21.000020: I tensorflow/core/common_runtime/process_util.cc:71] Creating new thread pool with default inter op setting: 8. Tune using inter_op_parallelism_threads for best performance.
OMP: Error #15: Initializing libiomp5.dylib, but found libiomp5.dylib already initialized.
OMP: Hint: This means that multiple copies of the OpenMP runtime have been linked into the program. That is dangerous, since it can degrade performance or cause incorrect results. The best thing to do is to ensure that only a single OpenMP runtime is linked into the process, e.g. by avoiding static linking of the OpenMP runtime in any library. As an unsafe, unsupported, undocumented workaround you can set the environment variable KMP_DUPLICATE_LIB_OK=TRUE to allow the program to continue to execute, but that may cause crashes or silently produce incorrect results. For more information, please see http://www.intel.com/software/products/support/.
解决方法
- https://github.com/dmlc/xgboost/issues/1715: 此方法未能解决问题
- 解决：https://qiita.com/161abcd/items/6ddf76366bc30c79522f
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'