mirror of https://github.com/fxsjy/jieba.git
Add paddle-based word segmentation and POS tagging (#788)
* paddle cut release
* Update README.md to prompt users to install paddlepaddle-tiny
* Remove the UTF coding header from the two __init__.py files
* Polish README details
parent 38134ee20f
commit 5b3bb4b7f2
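For context, the paddle mode this commit wires up is driven through jieba's top-level API, as documented in the README change it also touches; a minimal usage sketch (it assumes paddlepaddle-tiny is installed):

import jieba
import jieba.posseg as pseg

jieba.enable_paddle()  # loads the paddle-based lexical analysis model added here

print("/".join(jieba.cut("我来到北京清华大学", use_paddle=True)))  # paddle segmentation

for word, flag in pseg.cut("我爱北京天安门", use_paddle=True):  # paddle POS tagging
    print(word, flag)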
@@ -0,0 +1,46 @@
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Define the function that creates the lexical analysis model and the model's data reader.
"""
import sys
import os
import math

import paddle
import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer

import jieba.lac_small.nets as nets


def create_model(vocab_size, num_labels, mode='train'):
    """Create the LAC model."""

    # model's input data; targets is only consumed by a training graph
    words = fluid.data(name='words', shape=[-1, 1], dtype='int64', lod_level=1)
    targets = fluid.data(
        name='targets', shape=[-1, 1], dtype='int64', lod_level=1)

    # for the inference process
    if mode == 'infer':
        crf_decode = nets.lex_net(
            words, vocab_size, num_labels, for_infer=True, target=None)
        return {
            "feed_list": [words],
            "words": words,
            "crf_decode": crf_decode,
        }
    # training mode is not wired up in this small inference-only copy
    # (the original `return ret` referenced a name that was never defined)
    raise ValueError("create_model only supports mode='infer'")
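Since `words` is declared with `lod_level=1`, callers must feed a LoD tensor; a minimal sketch of building one, mirroring the predict code later in this diff (the ids are hypothetical):

import numpy
import paddle.fluid as fluid

place = fluid.CPUPlace()
ids = numpy.array([10, 42, 7], dtype=numpy.int64).reshape(-1, 1)  # hypothetical char ids
words_tensor = fluid.create_lod_tensor(ids, [[ids.shape[0]]], place)  # one sentence, 3 chars
# exe.run(..., feed={"words": words_tensor}, fetch_list=[ret["crf_decode"]], return_numpy=False)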
Binary files not shown (20 files).
@@ -0,0 +1,122 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The function lex_net(args) defines the lexical analysis network structure.
"""
import sys
import os
import math

import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer


def lex_net(word, vocab_size, num_labels, for_infer=True, target=None):
    """
    Define the lexical analysis network structure.
    word: stores the input of the model
    for_infer: a boolean value indicating whether the model is created for training or for inference

    return:
        the CRF decoding result (this small inference-only variant always returns the prediction)
    """

    word_emb_dim = 128
    grnn_hidden_dim = 128
    bigru_num = 2
    emb_lr = 1.0
    crf_lr = 1.0
    init_bound = 0.1
    IS_SPARSE = True

    def _bigru_layer(input_feature):
        """
        Define the bidirectional GRU layer.
        """
        # forward GRU: the fc projects to 3 * hidden size, as dynamic_gru's gates expect
        pre_gru = fluid.layers.fc(
            input=input_feature,
            size=grnn_hidden_dim * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru = fluid.layers.dynamic_gru(
            input=pre_gru,
            size=grnn_hidden_dim,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        # backward GRU over the reversed sequence
        pre_gru_r = fluid.layers.fc(
            input=input_feature,
            size=grnn_hidden_dim * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru_r = fluid.layers.dynamic_gru(
            input=pre_gru_r,
            size=grnn_hidden_dim,
            is_reverse=True,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        bi_merge = fluid.layers.concat(input=[gru, gru_r], axis=1)
        return bi_merge

    def _net_conf(word, target=None):
        """
        Configure the network.
        """
        word_embedding = fluid.embedding(
            input=word,
            size=[vocab_size, word_emb_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(
                learning_rate=emb_lr,
                name="word_emb",
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound)))

        # stack bigru_num bidirectional GRU layers
        input_feature = word_embedding
        for i in range(bigru_num):
            bigru_output = _bigru_layer(input_feature)
            input_feature = bigru_output

        # per-character emission scores over the label set
        emission = fluid.layers.fc(
            size=num_labels,
            input=bigru_output,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        # CRF transition parameters (size + 2 rows cover start/end transitions)
        size = emission.shape[1]
        fluid.layers.create_parameter(
            shape=[size + 2, size], dtype=emission.dtype, name='crfw')
        crf_decode = fluid.layers.crf_decoding(
            input=emission, param_attr=fluid.ParamAttr(name='crfw'))

        return crf_decode

    return _net_conf(word)
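To make the wiring concrete, here is the dimension flow implied by the constants in lex_net; this is a sketch, not part of the diff (the 57-label figure comes from tag.dic later in this commit):

# input:        (T, 1) character ids, T = sentence length
# embedding:    (T, 128)                       word_emb_dim = 128
# per _bigru_layer:
#   fc:         (T, 384) = 3 * grnn_hidden_dim, gate pre-activations for dynamic_gru
#   gru fwd:    (T, 128)
#   gru bwd:    (T, 128)  (is_reverse=True)
#   concat:     (T, 256)
# after bigru_num = 2 stacked layers: (T, 256)
# emission fc:  (T, num_labels), num_labels = 57 given tag.dic below
# crf_decoding: best tag id per character via the 'crfw' transition parameters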
@@ -0,0 +1,82 @@
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import argparse
import os
import time
import sys

import paddle.fluid as fluid
import paddle

import jieba.lac_small.utils as utils
import jieba.lac_small.creator as creator
import jieba.lac_small.reader_small as reader_small
import numpy

word_emb_dim = 128
grnn_hidden_dim = 128
bigru_num = 2
use_cuda = False
basepath = os.path.abspath(__file__)
folder = os.path.dirname(basepath)
init_checkpoint = os.path.join(folder, "model_baseline")
batch_size = 1

# build the inference program and load the checkpoint once, at import time
dataset = reader_small.Dataset()
infer_program = fluid.Program()
with fluid.program_guard(infer_program, fluid.default_startup_program()):
    with fluid.unique_name.guard():
        infer_ret = creator.create_model(dataset.vocab_size, dataset.num_labels, mode='infer')
infer_program = infer_program.clone(for_test=True)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
utils.init_checkpoint(exe, init_checkpoint, infer_program)
results = []


def get_sent(str1):
    """Segment str1 and return only the word list."""
    feed_data = dataset.get_vars(str1)
    a = numpy.array(feed_data).astype(numpy.int64)
    a = a.reshape(-1, 1)
    c = fluid.create_lod_tensor(a, [[a.shape[0]]], place)

    words, crf_decode = exe.run(
        infer_program,
        fetch_list=[infer_ret['words'], infer_ret['crf_decode']],
        feed={"words": c},
        return_numpy=False,
        use_program_cache=True)
    sents = []
    sent, tag = utils.parse_result(words, crf_decode, dataset)
    sents = sents + sent
    return sents


def get_result(str1):
    """Segment str1 and return both the word list and the per-word tags."""
    feed_data = dataset.get_vars(str1)
    a = numpy.array(feed_data).astype(numpy.int64)
    a = a.reshape(-1, 1)
    c = fluid.create_lod_tensor(a, [[a.shape[0]]], place)

    words, crf_decode = exe.run(
        infer_program,
        fetch_list=[infer_ret['words'], infer_ret['crf_decode']],
        feed={"words": c},
        return_numpy=False,
        use_program_cache=True)
    results = []
    results += utils.parse_result(words, crf_decode, dataset)
    return results
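A usage sketch of the two helpers; the module name `predict` and the sample output are assumptions, since the diff viewer hides the file names:

import jieba.lac_small.predict as predict  # assumed module name for the file above

print(predict.get_sent("我来到北京清华大学"))
# a list of segmented words, e.g. ['我', '来到', '北京', '清华大学']

words, tags = predict.get_result("我来到北京清华大学")
# parse_result returns (words, tags), so get_result yields the word list and per-word tags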
@@ -0,0 +1,100 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The file reader converts raw corpus to model input.
"""

import os
import __future__
import io
import paddle
import paddle.fluid as fluid


def load_kv_dict(dict_path,
                 reverse=False,
                 delimiter="\t",
                 key_func=None,
                 value_func=None):
    """
    Load a key-value dict from a file.
    """
    result_dict = {}
    for line in io.open(dict_path, "r", encoding='utf8'):
        terms = line.strip("\n").split(delimiter)
        if len(terms) != 2:
            continue
        if reverse:
            value, key = terms
        else:
            key, value = terms
        if key in result_dict:
            raise KeyError("key duplicated with [%s]" % (key))
        if key_func:
            key = key_func(key)
        if value_func:
            value = value_func(value)
        result_dict[key] = value
    return result_dict


class Dataset(object):
    """data reader"""
    def __init__(self):
        # read the word and tag dictionaries shipped next to this module
        basepath = os.path.abspath(__file__)
        folder = os.path.dirname(basepath)
        word_dict_path = os.path.join(folder, "word.dic")
        label_dict_path = os.path.join(folder, "tag.dic")
        self.word2id_dict = load_kv_dict(
            word_dict_path, reverse=True, value_func=int)
        self.id2word_dict = load_kv_dict(word_dict_path)
        self.label2id_dict = load_kv_dict(
            label_dict_path, reverse=True, value_func=int)
        self.id2label_dict = load_kv_dict(label_dict_path)

    @property
    def vocab_size(self):
        """vocabulary size"""
        return max(self.word2id_dict.values()) + 1

    @property
    def num_labels(self):
        """number of labels"""
        return max(self.label2id_dict.values()) + 1

    def word_to_ids(self, words):
        """convert words to word indices; unknown words map to "OOV" """
        word_ids = []
        for word in words:
            if word not in self.word2id_dict:
                word = "OOV"
            word_id = self.word2id_dict[word]
            word_ids.append(word_id)
        return word_ids

    def label_to_ids(self, labels):
        """convert labels to label indices; unknown labels map to "O" """
        label_ids = []
        for label in labels:
            if label not in self.label2id_dict:
                label = "O"
            label_id = self.label2id_dict[label]
            label_ids.append(label_id)
        return label_ids

    def get_vars(self, str1):
        """convert an input string to the per-character id list fed to the model"""
        words = str1.strip()
        word_ids = self.word_to_ids(words)
        return word_ids
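A small usage sketch of the reader; the assertions are illustrative, not part of the diff (the 57 comes from tag.dic, shown next):

import jieba.lac_small.reader_small as reader_small

dataset = reader_small.Dataset()      # loads word.dic / tag.dic located next to the module
ids = dataset.get_vars("你好世界")    # one id per character; unseen characters map to "OOV"
assert len(ids) == 4
assert dataset.num_labels == 57       # tag.dic below defines labels 0..56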
@@ -0,0 +1,57 @@
0	a-B
1	a-I
2	ad-B
3	ad-I
4	an-B
5	an-I
6	c-B
7	c-I
8	d-B
9	d-I
10	f-B
11	f-I
12	m-B
13	m-I
14	n-B
15	n-I
16	nr-B
17	nr-I
18	ns-B
19	ns-I
20	nt-B
21	nt-I
22	nw-B
23	nw-I
24	nz-B
25	nz-I
26	p-B
27	p-I
28	q-B
29	q-I
30	r-B
31	r-I
32	s-B
33	s-I
34	t-B
35	t-I
36	u-B
37	u-I
38	v-B
39	v-I
40	vd-B
41	vd-I
42	vn-B
43	vn-I
44	w-B
45	w-I
46	xc-B
47	xc-I
48	PER-B
49	PER-I
50	LOC-B
51	LOC-I
52	ORG-B
53	ORG-I
54	TIME-B
55	TIME-I
56	O
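Each entry maps a label id to a `{POS}-B` / `{POS}-I` tag: `-B` marks the first character of a word, `-I` a continuation; PER/LOC/ORG/TIME are named-entity labels and `O` is outside. A hypothetical character-level tagging under this scheme:

chars = ["我", "来", "到", "北", "京"]
tags  = ["r-B", "v-B", "v-I", "LOC-B", "LOC-I"]   # hypothetical decoding
# merging yields: 我/r  来到/v  北京/LOC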
@@ -0,0 +1,142 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
util tools
"""
from __future__ import print_function
import os
import sys
import numpy as np
import paddle.fluid as fluid
import io


def str2bool(v):
    """
    argparse does not parse booleans natively, so accept common truthy strings
    """
    return v.lower() in ("true", "t", "1")


def parse_result(words, crf_decode, dataset):
    """parse the LoD tensor result into (words, tags)"""
    offset_list = (crf_decode.lod())[0]
    words = np.array(words)
    crf_decode = np.array(crf_decode)
    batch_size = len(offset_list) - 1

    for sent_index in range(batch_size):
        begin, end = offset_list[sent_index], offset_list[sent_index + 1]
        sent = []
        for id in words[begin:end]:
            if dataset.id2word_dict[str(id[0])] == 'OOV':
                sent.append(' ')
            else:
                sent.append(dataset.id2word_dict[str(id[0])])
        tags = [
            dataset.id2label_dict[str(id[0])] for id in crf_decode[begin:end]
        ]

        sent_out = []
        tags_out = []
        partial_word = ""
        for ind, tag in enumerate(tags):
            # for the first character
            if partial_word == "":
                partial_word = sent[ind]
                tags_out.append(tag.split('-')[0])
                continue

            # a "-B" tag (or an "O" after a non-"O") starts a new word
            if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"):
                sent_out.append(partial_word)
                tags_out.append(tag.split('-')[0])
                partial_word = sent[ind]
                continue

            # otherwise extend the current word
            partial_word += sent[ind]

        # append the last word, except when len(tags) == 0
        if len(sent_out) < len(tags_out):
            sent_out.append(partial_word)
    return sent_out, tags_out


def parse_padding_result(words, crf_decode, seq_lens, dataset):
    """parse the padded (non-LoD) result"""
    words = np.squeeze(words)
    batch_size = len(seq_lens)

    batch_out = []
    for sent_index in range(batch_size):

        # index by sentence and sequence length, skipping the begin/end marks
        # (the original referenced undefined begin/end here)
        sent = []
        for id in words[sent_index][1:seq_lens[sent_index] - 1]:
            if dataset.id2word_dict[str(id)] == 'OOV':
                sent.append(' ')
            else:
                sent.append(dataset.id2word_dict[str(id)])
        tags = [
            dataset.id2label_dict[str(id)]
            for id in crf_decode[sent_index][1:seq_lens[sent_index] - 1]
        ]

        sent_out = []
        tags_out = []
        partial_word = ""
        for ind, tag in enumerate(tags):
            # for the first character
            if partial_word == "":
                partial_word = sent[ind]
                tags_out.append(tag.split('-')[0])
                continue

            # a "-B" tag (or an "O" after a non-"O") starts a new word
            if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"):
                sent_out.append(partial_word)
                tags_out.append(tag.split('-')[0])
                partial_word = sent[ind]
                continue

            partial_word += sent[ind]

        # append the last word, except when len(tags) == 0
        if len(sent_out) < len(tags_out):
            sent_out.append(partial_word)

        batch_out.append([sent_out, tags_out])
    return batch_out


def init_checkpoint(exe, init_checkpoint_path, main_program):
    """
    Init CheckPoint
    """
    assert os.path.exists(
        init_checkpoint_path), "[%s] can't be found." % init_checkpoint_path

    def existed_persistables(var):
        """
        whether var is persistable and exists in the checkpoint
        """
        if not fluid.io.is_persistable(var):
            return False
        return os.path.exists(os.path.join(init_checkpoint_path, var.name))

    fluid.io.load_vars(
        exe,
        init_checkpoint_path,
        main_program=main_program,
        predicate=existed_persistables)
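A self-contained toy version of the B/I merging rule that parse_result implements, to show the decoding on plain lists (the example tags are hypothetical):

def merge(sent, tags):
    sent_out, tags_out, partial = [], [], ""
    for ind, tag in enumerate(tags):
        if partial == "":                 # first character
            partial = sent[ind]
            tags_out.append(tag.split('-')[0])
        elif tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"):
            sent_out.append(partial)      # close the previous word
            tags_out.append(tag.split('-')[0])
            partial = sent[ind]
        else:
            partial += sent[ind]          # "-I" (or a repeated "O") extends the word
    if len(sent_out) < len(tags_out):
        sent_out.append(partial)          # flush the last word
    return sent_out, tags_out

print(merge(["我", "来", "到", "北", "京"], ["r-B", "v-B", "v-I", "ns-B", "ns-I"]))
# -> (['我', '来到', '北京'], ['r', 'v', 'ns'])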
File diff suppressed because it is too large.