-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathgetKeytag.py
More file actions
65 lines (55 loc) · 1.94 KB
/
getKeytag.py
File metadata and controls
65 lines (55 loc) · 1.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# -*- coding:utf8 -*-
# import jieba.analyse as textrank
import kwExtract
# def loadFile(filename, mode = 1):
# """
# 输入文件名读取文件中内容。
# """
# f = open(filename, 'r')
# if mode ==1:
# article = []
# for line in f.readlines():
# article.append(line.decode('gbk','ignore'))
# elif mode == 2:
# article = ""
# for line in f.readlines():
# article+=(line.strip().decode('gbk','ignore'))
# else:
# print "WRONG MODE CHOOSE"
# articl = ""
# f.close()
# print "Article Loading Complete..."
# return article
def tag_extract(text, withWeight=True, topK = 10, span = 2, threshold = 10):
"""
text 为正文,string类型
topK 表示这一段文字最多产生topK个关键词
span 表示输出的关键术语至少是几个字。
threshold 表示已出现在idf词表中的词语的idf阈值是多大
"""
keyWords = kwExtract.extract_tags(text, withWeight=withWeight, span = span, threshold = threshold)
return keyWords[:topK]
def terminology(content, topK = 10, span = 2, threshold = 5, mode = 1):
"""
输出不多于topK个由该文件产生的术语。
mode: 如果mode值为1则采用分段统计的方法;如果mode值为2则将整篇讲义合成一篇文章进行提取
"""
if mode == 1:
term = {}
num = 0.0
for c in content:
new_term = tag_extract(c, topK = topK, span = 2, threshold = threshold)
for item in new_term:
term[item[0]] = term.get(item[0],0.0)+item[1]
num += 1.0
print "working\t%f"%(num/(len(content)+0.0)*100),"%"
res = sorted(term, key=term.__getitem__, reverse=True)[:topK]
elif mode == 2:
res = tag_extract(content, withWeight = False, topK = topK, span = 2, threshold = threshold)[:topK]
return res
if __name__ == '__main__':
#example.txt是生物进化课的课程讲义。
#example2.txt是经济学思想史讲义
terms = terminology("example2.txt", topK = 10, span = 2, threshold = 5, mode = 1)
for word in terms:
print word.encode('gbk','ignore')