-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathcutWords.py
More file actions
77 lines (63 loc) · 2.6 KB
/
cutWords.py
File metadata and controls
77 lines (63 loc) · 2.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# ~~~~~~~~~~~~~~~~~~~~
# Code by Anleo YUAN
# Please visit https://www.anleo.top
# Copyrights reserved by Chongqing University
# ~~~~~~~~~~~~~~~~~~~~
# 将语料库中的文本进行分词,并且去除停用词
# coding=utf-8
import jieba
import jieba.posseg as pseg
import time
import os
# 文本分词
def cutText(dirname, outputPrefix=None, stopWordsPath='Chinesestopword.txt',
            maxFilesPerCategory=10200):
    """Segment every text file of a categorised corpus and drop stop words.

    ``dirname`` is scanned for sub-directories, each treated as one category;
    plain files directly under ``dirname`` are skipped. Every file in a
    category is read as UTF-8, segmented with ``jieba.posseg``, filtered, and
    written as space-separated words to ``<outputPrefix>/<category>/<n>.txt``,
    where ``n`` counts from 0 within the category.

    A word is kept only if it is longer than one character and is not in the
    stop-word list.

    Args:
        dirname: Root directory of the corpus.
        outputPrefix: Root directory for the segmented output. When omitted,
            the module-level ``writeFilePathPrefix`` is used (the script
            entry point defines it), which keeps ``cutText(dirname)`` working
            as before.
        stopWordsPath: UTF-8 text file with one stop word per line. It is
            read once, before any corpus file is touched.
        maxFilesPerCategory: Upper bound on the number of files processed in
            each category; ``None`` means no limit.

    Raises:
        OSError: If the corpus, the stop-word file, or the output location
            cannot be read or written.
    """
    if outputPrefix is None:
        # Backward compatibility: older callers rely on this global.
        outputPrefix = writeFilePathPrefix

    # Loaded once (not once per input file) and kept as a set so that each
    # membership test is O(1).
    stopWords = _loadStopWords(stopWordsPath)

    for category in os.listdir(dirname):
        catdir = os.path.join(dirname, category)
        if not os.path.isdir(catdir):
            continue

        files = os.listdir(catdir)
        if maxFilesPerCategory is not None:
            # Slice up front instead of looping over files past the quota.
            files = files[:max(0, maxFilesPerCategory)]
        if not files:
            continue

        # makedirs also creates a missing output root and tolerates an
        # already existing category directory.
        writeDir = os.path.join(outputPrefix, category)
        os.makedirs(writeDir, exist_ok=True)

        for index, curFile in enumerate(files):
            with open(os.path.join(catdir, curFile), "r", encoding='utf-8') as f:
                content = f.read()

            # The length check alone rejects single characters such as
            # '\n' and '\u3000', so no separate whitespace test is needed.
            kept = []
            for pair in pseg.cut(content):
                word = str(pair.word)
                if len(word) > 1 and word not in stopWords:
                    kept.append(word)

            writeFileName = os.path.join(writeDir, str(index) + ".txt")
            with open(writeFileName, "w", encoding='utf-8') as f:
                f.write(" ".join(kept))


def _loadStopWords(path):
    """Return the stripped lines of the UTF-8 file ``path`` as a set."""
    with open(path, "r", encoding='utf-8') as f:
        return {line.strip() for line in f}
if __name__ == '__main__':
    # Corpus input root and segmented-output root. Both stay module-level
    # names because cutText reads writeFilePathPrefix as a global.
    readFilePathPrefix = "E:/THUCNews"
    writeFilePathPrefix = "E:/cutTHUCNews"

    # Time the whole segmentation run.
    startedAt = time.time()
    cutText(readFilePathPrefix)
    elapsedSeconds = time.time() - startedAt

    # Report the elapsed wall-clock time in seconds.
    print("您的分词终于完成,耗时:" + str(elapsedSeconds) + "秒。")