2 changes: 1 addition & 1 deletion Part1_TF-IDF/example.py
@@ -29,7 +29,7 @@ def com_tf():
 
     # Write the counts to result.txt (iterate over the dict)
     for (k, v) in num_dict.items():
-        open('data/result.txt', 'a+').write(str(k) + ' ' + str(v) + '\n')  # convert k, v to str
+        open('data/result.txt', 'a+').write(f'{str(k)} {str(v)}' + '\n')
Function com_tf refactored with the following changes:

This removes the following comments (why?):

# convert k, v to str



if __name__ == '__main__':
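Reviewer note on this hunk: the new f-string still wraps k and v in str(), which f-strings already do, keeps the newline as a separate concatenation, and the line reopens result.txt on every loop iteration without ever closing it. A minimal sketch of a further cleanup (not part of this PR; num_dict as in com_tf):

with open('data/result.txt', 'a+') as f:  # one open call, closed automatically
    for k, v in num_dict.items():
        f.write(f'{k} {v}\n')  # f-strings call str() implicitly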
4 changes: 2 additions & 2 deletions Part1_TF-IDF/src/GrobalParament.py
@@ -17,8 +17,8 @@
 ResultFileName = "result.txt"  # name of the search-results file
 
 path = '/home/kaifun/PycharmProjects/TextInfoExp/Part1_TF-IDF/'  # raw data
-path1 = path + 'data/title_and_abs/'
-newpath = path + "data/pro_keyword/"
+path1 = f'{path}data/title_and_abs/'
+newpath = f"{path}data/pro_keyword/"
Comment on lines -20 to +21
Lines 20-21 refactored with the following changes:

newpath2 = path

# path1 = 'C:/Users/kaifun/Desktop/ass_TIP/TextInfoProcess/Test_one_TF-IDF/data_afterprocess/title_and_abs/'  # processed titles and abstracts
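The f-string form is equivalent to the old concatenation, but the hard-coded absolute path with a trailing slash still pins the project to one machine. A hedged sketch of a more portable layout (pathlib is a suggestion here, not something the PR introduces):

from pathlib import Path

# Assumes this file lives in Part1_TF-IDF/src/; adjust if the layout differs.
BASE = Path(__file__).resolve().parent.parent
path1 = BASE / 'data' / 'title_and_abs'
newpath = BASE / 'data' / 'pro_keyword'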
71 changes: 32 additions & 39 deletions Part1_TF-IDF/src/get_TF_IDF.py
@@ -8,44 +8,39 @@
 
 def TF_IDF_Compute(file_import_url_temp):
     file_import_url = file_import_url_temp.replace('\\', '/')
-    data_source = open(file_import_url, 'r')
-    data = data_source.readline()
-    word_in_afile_stat = {}
-    word_in_allfiles_stat = {}
-    files_num = 0
-    while data != "":  # process the file pro_res.txt
-        data_temp_1 = data.strip("\n").split("\t")  # file name and key words of a file
-        data_temp_2 = data_temp_1[1].split(",")  # key words of a file
-        file_name = data_temp_1[0]
-        data_temp_len = len(data_temp_2)
-        files_num += 1
-        data_dict = {}
-        data_dict.clear()
-        for word in data_temp_2:
-            if word not in word_in_allfiles_stat:
-                word_in_allfiles_stat[word] = 1
-                data_dict[word] = 1
-            else:
-                if word not in data_dict:  # this word has not appeared in this file before
+    with open(file_import_url, 'r') as data_source:
+        data = data_source.readline()
+        word_in_afile_stat = {}
+        word_in_allfiles_stat = {}
+        files_num = 0
+        while data != "":  # process the file pro_res.txt
+            data_temp_1 = data.strip("\n").split("\t")  # file name and key words of a file
+            data_temp_2 = data_temp_1[1].split(",")  # key words of a file
+            file_name = data_temp_1[0]
+            data_temp_len = len(data_temp_2)
+            files_num += 1
+            data_dict = {}
+            data_dict.clear()
+            for word in data_temp_2:
+                if word not in word_in_allfiles_stat:
+                    word_in_allfiles_stat[word] = 1
+                    data_dict[word] = 1
+                elif word not in data_dict:  # this word has not appeared in this file before
                     word_in_allfiles_stat[word] += 1
                     data_dict[word] = 1
 
-            if not word_in_afile_stat.has_key(file_name):
-                word_in_afile_stat[file_name] = {}
-            if not word_in_afile_stat[file_name].has_key(word):
-                word_in_afile_stat[file_name][word] = []
-            word_in_afile_stat[file_name][word].append(data_temp_2.count(word))
-            word_in_afile_stat[file_name][word].append(data_temp_len)
-        data = data_source.readline()
-    data_source.close()
+                if not word_in_afile_stat.has_key(file_name):
+                    word_in_afile_stat[file_name] = {}
+                if not word_in_afile_stat[file_name].has_key(word):
+                    word_in_afile_stat[file_name][word] = [data_temp_2.count(word), data_temp_len]
+            data = data_source.readline()
 
     # filelist = os.listdir(newpath2)  # get all files under the current path
     TF_IDF_last_result = []
     if (word_in_afile_stat) and (word_in_allfiles_stat) and (files_num != 0):
-        for filename in word_in_afile_stat.keys():
+        for filename, value in word_in_afile_stat.items():
             TF_IDF_result = {}
             TF_IDF_result.clear()
-            for word in word_in_afile_stat[filename].keys():
+            for word in value.keys():
Comment on lines -11 to +43
Function TF_IDF_Compute refactored with the following changes:

                 word_n = word_in_afile_stat[filename][word][0]
                 word_sum = word_in_afile_stat[filename][word][1]
                 with_word_sum = word_in_allfiles_stat[word]
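For orientation, these context lines set up the usual TF-IDF inputs: word_n is the word's count in one file, word_sum that file's total key words, with_word_sum the number of files containing the word, and files_num the corpus size. The score computed below is presumably the classic tf * idf = (word_n / word_sum) * log(files_num / with_word_sum). A minimal sketch with hypothetical values:

import math

word_n = 3          # hypothetical: occurrences of the word in this file
word_sum = 120      # hypothetical: total key words in this file
files_num = 50      # hypothetical: number of files in the corpus
with_word_sum = 5   # hypothetical: files containing the word

tf = word_n / word_sum                     # term frequency
idf = math.log(files_num / with_word_sum)  # inverse document frequency
print(tf * idf)                            # the TF-IDF score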
@@ -56,19 +51,17 @@ def TF_IDF_Compute(file_import_url_temp):

             # line = f1.readline()
             TF_IDF_last_result.append(filename)
-            TF_IDF_last_result.extend(result_temp[0:10])
+            TF_IDF_last_result.extend(result_temp[:10])
 
             # TF_IDF_last_result.append(line)
             TF_IDF_last_result.append('\n')
 
-    f = open("results.txt", "a+")
-
-    for s in TF_IDF_last_result:
-        # print s
-        for i in s:
-            f.write(str(i))
-        f.write("\n")
-    f.close()
+    with open("results.txt", "a+") as f:
+        for s in TF_IDF_last_result:
+            # print s
+            for i in s:
+                f.write(str(i))
+            f.write("\n")


if __name__ == '__main__':
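One portability point the refactor leaves untouched: dict.has_key() was removed in Python 3, so TF_IDF_Compute only runs under Python 2 both before and after this change. A hedged Python 3 sketch of the same nested-dict bookkeeping (names mirror the originals; record is a hypothetical helper):

from collections import defaultdict

word_in_afile_stat = defaultdict(dict)  # file_name -> {word: [count, total]}

def record(file_name, word, words_in_file):
    # `in` replaces the Python 2 has_key(); the defaultdict replaces
    # the manual "create the inner dict if missing" step.
    if word not in word_in_afile_stat[file_name]:
        word_in_afile_stat[file_name][word] = [
            words_in_file.count(word),  # occurrences of the word in this file
            len(words_in_file),         # total key words in this file
        ]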
51 changes: 23 additions & 28 deletions Part1_TF-IDF/src/get_data.py
@@ -14,10 +14,10 @@
 
 # path='/home/mbtrec/mhwang/pro/computer/'
 base_path = GrobalParament.path
-path = base_path + 'data/computer/'  # raw data
-path1 = base_path + 'data/title_and_abs/'  # processed titles and abstracts
-newpath = base_path + 'data/pro_keyword/'
-newpath2 = base_path + 'data/keyword/'
+path = f'{base_path}data/computer/'
+path1 = f'{base_path}data/title_and_abs/'
+newpath = f'{base_path}data/pro_keyword/'
+newpath2 = f'{base_path}data/keyword/'
Comment on lines -17 to +20
Lines 17-20 refactored with the following changes:

This removes the following comments (why?):

# processed titles and abstracts
# raw data


filelist = os.listdir(path)  # get all files under the current path

@@ -29,29 +29,25 @@ def get_text():
         filename = os.path.splitext(files)[0]  # take the file name
         soup = BeautifulSoup(open(path + filename + '.xml'), 'html.parser')  # parse the page
         b = soup.find("p", class_="abstracts")  # grab the content of the tag "p", class_="abstracts"
-        # print b
         if b is None or b.string is None:
             continue
-        else:
-            abstracts.extend(soup.title.stripped_strings)
-            s = b.string
-            abstracts.extend(s.encode('utf-8'))
-            f = open(path1 + filename + ".txt", "w+")  # write to a txt file
+        abstracts.extend(soup.title.stripped_strings)
+        s = b.string
+        abstracts.extend(s.encode('utf-8'))
+        with open(path1 + filename + ".txt", "w+") as f:
             for i in abstracts:
                 f.write(i)
-            f.close()
-            abstracts = []
+        abstracts = []
Comment on lines -32 to +40
Function get_text refactored with the following changes:

This removes the following comments (why?):

# write to a txt file
# put the raw, unprocessed text in the pro_keyword folder
# print b
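A side observation on get_text that the PR does not touch: abstracts.extend(s.encode('utf-8')) extends the list with the encoded string element by element, which happens to work under Python 2 but would hand ints to f.write(i) under Python 3 and raise TypeError. A hedged Python 3 sketch of the same intent:

# Sketch only: collect title and abstract as whole strings and
# write them in one call instead of extending byte by byte.
abstracts = list(soup.title.stripped_strings)
abstracts.append(b.string)
with open(path1 + filename + ".txt", "w+", encoding="utf-8") as f:
    f.write("".join(abstracts))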


         # getPro_keyword: clean the text out of the dl tags in the xml file
         links = soup.find_all("dl")
         # print links
         for link in links:
             s1 = link.get_text()
             # print s1
-            f = open(newpath + filename + ".txt", "w+")  # put the raw, unprocessed text in the pro_keyword folder
-            for i in s1:
-                f.write(i)
-            f.close()
+            with open(newpath + filename + ".txt", "w+") as f:
+                for i in s1:
+                    f.write(i)


# Further process the files in the pro_keyword folder from the previous step to get each file's key words
@@ -62,18 +58,17 @@ def get_keyword():
         filename = os.path.splitext(files)[0]
         begin = 100000
         end = 10000
-        f1 = open(newpath + filename + ".txt", "r")
-        f2 = open(newpath2 + filename + '.txt', "w+")
-        for (num, value) in enumerate(f1):
-            if value.count("关键词") > 0:  # record the line number of the key-words line
-                begin = num
-            if value.count("基金项目") > 0 or value.count("机标分类号") > 0 or value.count("机标关键词") > 0 or value.count(
-                    "基金项目") > 0 or value.count("DOI") > 0:
-                end = num
-            if num > begin and num < end and value[:-1].strip():
-                f2.write(value.strip())
-                f2.write(" ")
-        f1.close()
+        with open(newpath + filename + ".txt", "r") as f1:
+            f2 = open(newpath2 + filename + '.txt', "w+")
+            for (num, value) in enumerate(f1):
+                if value.count("关键词") > 0:  # record the line number of the key-words line
+                    begin = num
+                if value.count("基金项目") > 0 or value.count("机标分类号") > 0 or value.count("机标关键词") > 0 or value.count(
+                        "基金项目") > 0 or value.count("DOI") > 0:
+                    end = num
+                if num > begin and num < end and value[:-1].strip():
+                    f2.write(value.strip())
+                    f2.write(" ")
Comment on lines -65 to +71
Function get_keyword refactored with the following changes:

f2.close()


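A final reviewer sketch for get_keyword (hedged, not part of this PR): the begin sentinel of 100000 means nothing is written until a "关键词" line resets it, the "基金项目" check appears twice in the same condition, and f2 is still opened without a context manager. One possible Python 3 shape, where extract_keywords is a hypothetical helper taking a list of lines:

END_MARKERS = ("基金项目", "机标分类号", "机标关键词", "DOI")  # drops the duplicate check

def extract_keywords(lines):
    begin = end = None
    for num, value in enumerate(lines):
        if begin is None and "关键词" in value:
            begin = num               # key-word section starts after this line
        if any(marker in value for marker in END_MARKERS):
            end = num                 # metadata section ends it
    if begin is None or end is None:
        return []
    return [v.strip() for v in lines[begin + 1:end] if v.strip()]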