nltk 获取 gutenberg 语料
gensim 生成词典和 one-hot 编码
我正在尝试基于 TensorFlow LSTM 模型开发另一个项目，需要用到自然语言处理的工具和语料。
import nltk
import numpy as np
from nltk.corpus import gutenberg
from gensim import corpora, models, similarities
class Book2Array(object):
    """Turn tokenized sentences into fixed-size one-hot encoded matrices.

    Each sentence is a sequence of string tokens. A gensim ``Dictionary``
    built from all sentences assigns every distinct token an integer id;
    a token is encoded as a vector of length ``vocab_size`` with a single 1
    at that id. A sentence becomes a ``(max_len, vocab_size)`` matrix:
    truncated to ``max_len`` tokens and padded with all-zero rows.

    Args:
        sentences: Sized iterable of token sequences (for example the
            result of ``gutenberg.sents(...)``).
        vocab_size: Length of every one-hot vector. Token ids greater than
            or equal to this value raise ``IndexError`` when encoded.
        max_len: Number of rows (tokens) in every encoded sentence.
    """

    # Class-level defaults; __init__ overwrites them per instance.
    sentences = None
    token2id_dic = None
    vocab_size = 8192
    max_len = 20

    def __init__(self, sentences, vocab_size=8192, max_len=20):
        self.sentences = sentences
        self.vocab_size = vocab_size
        self.max_len = max_len
        self.token2id_dic = self.get_token2id_dic()

    def get_sentences(self):
        """Print the number of sentences and return them as a plain list."""
        # Use the instance's own corpus rather than a script-level global,
        # so the method also works when the class is imported elsewhere.
        print(len(self.sentences))
        return list(self.sentences)

    def get_token2id_dic(self):
        """Build the token -> integer id mapping for ``self.sentences``.

        Prints the number of distinct tokens and returns the mapping.
        No filtering (stop words, rare words) is applied.
        """
        dictionary = corpora.Dictionary(self.sentences)
        print(len(dictionary))
        return dictionary.token2id

    def word2onehot(self, word):
        """Return the one-hot vector (length ``vocab_size``) for ``word``.

        Raises:
            KeyError: If ``word`` is not in the token dictionary.
            IndexError: If the token's id does not fit in ``vocab_size``.
        """
        onehot = np.zeros(self.vocab_size)
        onehot[self.token2id_dic[word]] = 1
        return onehot

    def sent2vec(self, sentence):
        """Encode one sentence as a ``(max_len, vocab_size)`` matrix.

        Tokens beyond ``max_len`` are dropped; shorter sentences leave the
        remaining rows all zero.

        Raises:
            KeyError: If a token is not in the token dictionary.
            IndexError: If a token's id does not fit in ``vocab_size``.
        """
        # Allocate the padded matrix once instead of stacking row vectors.
        matrix = np.zeros((self.max_len, self.vocab_size))
        for row, word in enumerate(sentence[:self.max_len]):
            matrix[row, self.token2id_dic[word]] = 1
        return matrix

    def sentences2array(self):
        """Return a list with one encoded matrix per sentence.

        Every matrix is dense, so memory use grows as
        ``len(sentences) * max_len * vocab_size`` floats.
        """
        return [self.sent2vec(sentence) for sentence in self.sentences]

    def gen_batch(self):
        """Placeholder for batch generation; not implemented yet."""
        pass
if __name__ == '__main__':
    # Demo run: one-hot encode every sentence of Macbeth and report the
    # shape of the first encoded sentence.
    macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
    encoder = Book2Array(macbeth_sentences)
    # Return value is unused; the call is kept for its printed output.
    encoder.get_sentences()
    encoded_sentences = encoder.sentences2array()
    print(np.array(encoded_sentences[0]).shape)