1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
| import json import os
import torch import torchtext.vocab as Vocab import jieba from tqdm import tqdm import collections from config import opt import numpy as np
def read_cnews():
    """Load the news corpus under ``opt.data_root`` and group it by label.

    Every sub-directory of ``opt.data_root`` is treated as one class whose
    directory name is the label, and every file inside it is one document.
    The label lookup tables are also written to ``index2labels.json`` and
    ``labels.json`` in the current working directory.

    :return: list with one inner list per label; each inner list holds
        ``[text, label_index]`` pairs, where ``text`` is the UTF-8 decoded
        file content with newlines, carriage returns and tabs removed
    """
    # os.listdir returns entries in arbitrary order, so sort to keep the
    # label -> index mapping reproducible between runs.  Only directories
    # are kept so a stray file in the data root cannot break the scan.
    labels = sorted(
        label for label in os.listdir(opt.data_root)
        if label != '.DS_Store'
        and os.path.isdir(os.path.join(opt.data_root, label))
    )
    print(labels)
    labels2index = {label: index for index, label in enumerate(labels)}
    index2labels = {index: label for index, label in enumerate(labels)}
    print(labels2index)
    print(index2labels)

    # JSON object keys are strings, so the integer indices of index2labels
    # are stored as strings in the file.
    with open('index2labels.json', 'w') as f:
        json.dump(index2labels, f)
    with open('labels.json', 'w') as f:
        json.dump(labels, f)

    # One pass over the text removes every '\n', '\r' and '\t'.
    strip_table = str.maketrans('', '', '\n\r\t')

    data = []
    for label in labels:
        folder_name = os.path.join(opt.data_root, label)
        # Same metadata-file filter as for the label directories above.
        file_names = sorted(
            name for name in os.listdir(folder_name) if name != '.DS_Store'
        )
        datasub = []
        for file in tqdm(file_names):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').translate(strip_table)
            datasub.append([review, labels2index[label]])
        data.append(datasub)
    return data
def split_data(data, val_size=200, test_size=200):
    """
    Split the dataset into training, validation and test sets.

    Each class is shuffled independently.  Its first ``val_size`` samples
    go to the validation set, the next ``test_size`` samples go to the test
    set and the remainder goes to the training set.  The combined training
    set is shuffled once more so the classes are interleaved.  A class with
    no more than ``val_size + test_size`` samples contributes nothing to
    the training set.

    :param data: list holding one list of samples per class
    :param val_size: validation samples taken from every class
    :param test_size: test samples taken from every class
    :return: tuple ``(train_data, val_data, test_data)`` of flat lists
    :raises ValueError: if ``val_size`` or ``test_size`` is negative
    """
    if val_size < 0 or test_size < 0:
        raise ValueError('val_size and test_size must be non-negative')

    train_data, val_data, test_data = [], [], []
    holdout_end = val_size + test_size

    for class_samples in data:
        # Shuffle a copy so the caller's lists keep their original order.
        shuffled = list(class_samples)
        np.random.shuffle(shuffled)
        val_data += shuffled[:val_size]
        test_data += shuffled[val_size:holdout_end]
        train_data += shuffled[holdout_end:]

    np.random.shuffle(train_data)
    print(len(train_data))
    print(len(val_data))
    print(len(test_data))

    return train_data, val_data, test_data
def stopwords(file_path):
    """Read a UTF-8 stop-word file, one entry per line.

    Leading and trailing whitespace is stripped from every line; blank
    lines are kept as empty strings.  The first five entries are printed
    as a quick sanity check.

    :param file_path: path of the stop-word file
    :return: list of stop words in file order
    """
    stopword = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stopword.append(raw_line.strip())
    print(stopword[:5])
    return stopword
def get_tokenized(data, stopword):
    """
    Tokenize every review with jieba and drop the stop words.

    :param data: list of [string, label] pairs; only the string is used
    :param stopword: collection of tokens to discard
    :return: list of token lists, one per review, in input order
    """
    # Build the lookup set once: a membership test against the original
    # list is a linear scan repeated for every token of every review.
    stopword_set = set(stopword)

    def tokenizer(text):
        return [tok for tok in jieba.lcut(text) if tok not in stopword_set]

    return [tokenizer(review) for review, _ in data]
def get_vocab(data, stopword):
    """Build the vocabulary from the tokenized reviews in ``data``.

    Token frequencies are counted over all reviews after stop-word removal
    and handed to ``Vocab.Vocab`` with ``min_freq=5`` and the special
    tokens ``'<pad>'`` and ``'<unk>'``.

    :param data: list of [string, label] pairs
    :param stopword: stop-word collection forwarded to ``get_tokenized``
    :return: the ``Vocab.Vocab`` instance built from the counts
    """
    token_counts = collections.Counter()
    for sentence in get_tokenized(data, stopword):
        token_counts.update(sentence)
    return Vocab.Vocab(token_counts, min_freq=5, specials=['<pad>', '<unk>'])
def preprocess_imdb(data, vocab, stopword, max_len=500):
    """Convert [text, label] pairs into fixed-length index tensors.

    Every review is tokenized, mapped to vocabulary indices through
    ``vocab.stoi`` and then truncated or right-padded with 0 so that each
    row holds exactly ``max_len`` indices.

    :param data: list of [string, label] pairs
    :param vocab: vocabulary object exposing a ``stoi`` token -> index map
    :param stopword: stop-word collection forwarded to ``get_tokenized``
    :param max_len: number of token indices kept per review
    :return: tuple ``(features, labels)`` of tensors built from the padded
        index rows and from the labels of ``data``
    """

    def pad(x):
        # NOTE(review): 0 is presumably the '<pad>' index, given the
        # specials order used in get_vocab -- confirm for other vocabs.
        return x[:max_len] if len(x) > max_len else x + [0] * (max_len - len(x))

    tokenized_data = get_tokenized(data, stopword)
    # NOTE(review): if vocab.stoi is a defaultdict, indexing it with an
    # unknown word inserts that word into the mapping -- confirm whether
    # that growth of vocab.stoi is intended.
    features = torch.tensor(
        [pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
    labels = torch.tensor([score for _, score in data])
    return features, labels
if __name__ == '__main__':
    # Load the corpus and carve out the three splits.
    data = read_cnews()
    print(len(data))
    train_data, val_data, test_data = split_data(data)
    stopword = stopwords('./cn_stopwords.txt')

    # The vocabulary is built from the training split only and its
    # token -> index table is saved before any preprocessing runs.
    vocab = get_vocab(train_data, stopword)
    print(vocab.itos[:5])
    with open('word2index.json', 'w', encoding='utf-8') as f:
        json.dump(vocab.stoi, f)

    separator = '---------------------'

    print(separator)
    for size in (len(vocab), len(vocab.itos), len(vocab.stoi)):
        print(size)
    print(separator)

    # Encode the splits in the order train, val, test.
    (X_train, y_train), (X_val, y_val), (X_test, y_test) = (
        preprocess_imdb(split, vocab, stopword)
        for split in (train_data, val_data, test_data)
    )

    # The same three sizes are printed again after preprocessing.
    print(separator)
    for size in (len(vocab), len(vocab.itos), len(vocab.stoi)):
        print(size)
    print(separator)

    with open('vocabsize.json', 'w') as f:
        json.dump(len(vocab), f)

    for tensor, path in (
        (X_train, 'X_train.pt'),
        (y_train, 'y_train.pt'),
        (X_val, 'X_val.pt'),
        (y_val, 'y_val.pt'),
        (X_test, 'X_test.pt'),
        (y_test, 'y_test.pt'),
    ):
        torch.save(tensor, path)

    print(X_train.shape, X_train[0])
    for tensor in (y_train, X_val, y_val, X_test, y_test):
        print(tensor.shape)
|