Sourcery refactored master branch #15

base: master
Changes from all commits
File 1:

```diff
@@ -17,8 +17,8 @@
 ResultFileName = "result.txt"  # name of the search-results file

 path = '/home/kaifun/PycharmProjects/TextInfoExp/Part1_TF-IDF/'  # raw data
-path1 = path + 'data/title_and_abs/'
-newpath = path + "data/pro_keyword/"
+path1 = f'{path}data/title_and_abs/'
+newpath = f"{path}data/pro_keyword/"
 newpath2 = path

 # path1 = 'C:/Users/kaifun/Desktop/ass_TIP/TextInfoProcess/Test_one_TF-IDF/data_afterprocess/title_and_abs/'  # processed titles and abstracts
```

Comment on lines -20 to +21 (Author): Lines 20-21 refactored (`use-fstring-for-concatenation`).
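For reference, a minimal sketch of the `use-fstring-for-concatenation` rewrite applied above. The `base` variable is hypothetical, and f-strings require Python 3.6 or later:

```python
base = '/home/user/project/'  # hypothetical base path, for illustration only

# Before: paths built by string concatenation
titles_dir = base + 'data/title_and_abs/'

# After: the same path via f-string interpolation (Python 3.6+)
titles_dir_f = f'{base}data/title_and_abs/'

assert titles_dir == titles_dir_f  # both spellings produce an identical string
```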
File 2:

```diff
@@ -8,44 +8,39 @@
 def TF_IDF_Compute(file_import_url_temp):
     file_import_url = file_import_url_temp.replace('\\', '/')
-    data_source = open(file_import_url, 'r')
-    data = data_source.readline()
-    word_in_afile_stat = {}
-    word_in_allfiles_stat = {}
-    files_num = 0
-    while data != "":  # process the file pro_res.txt
-        data_temp_1 = data.strip("\n").split("\t")  # file name and key words of a file
-        data_temp_2 = data_temp_1[1].split(",")  # key words of a file
-        file_name = data_temp_1[0]
-        data_temp_len = len(data_temp_2)
-        files_num += 1
-        data_dict = {}
-        data_dict.clear()
-        for word in data_temp_2:
-            if word not in word_in_allfiles_stat:
-                word_in_allfiles_stat[word] = 1
-                data_dict[word] = 1
-            else:
-                if word not in data_dict:  # if this word has not appeared in this file before
+    with open(file_import_url, 'r') as data_source:
+        data = data_source.readline()
+        word_in_afile_stat = {}
+        word_in_allfiles_stat = {}
+        files_num = 0
+        while data != "":  # process the file pro_res.txt
+            data_temp_1 = data.strip("\n").split("\t")  # file name and key words of a file
+            data_temp_2 = data_temp_1[1].split(",")  # key words of a file
+            file_name = data_temp_1[0]
+            data_temp_len = len(data_temp_2)
+            files_num += 1
+            data_dict = {}
+            data_dict.clear()
+            for word in data_temp_2:
+                if word not in word_in_allfiles_stat:
+                    word_in_allfiles_stat[word] = 1
+                    data_dict[word] = 1
+                elif word not in data_dict:  # if this word has not appeared in this file before
                     word_in_allfiles_stat[word] += 1
                     data_dict[word] = 1

-            if not word_in_afile_stat.has_key(file_name):
-                word_in_afile_stat[file_name] = {}
-            if not word_in_afile_stat[file_name].has_key(word):
-                word_in_afile_stat[file_name][word] = []
-            word_in_afile_stat[file_name][word].append(data_temp_2.count(word))
-            word_in_afile_stat[file_name][word].append(data_temp_len)
-        data = data_source.readline()
-    data_source.close()
+                if not word_in_afile_stat.has_key(file_name):
+                    word_in_afile_stat[file_name] = {}
+                if not word_in_afile_stat[file_name].has_key(word):
+                    word_in_afile_stat[file_name][word] = [data_temp_2.count(word), data_temp_len]
+            data = data_source.readline()

     # filelist = os.listdir(newpath2)  # get all files under the current path
     TF_IDF_last_result = []
     if (word_in_afile_stat) and (word_in_allfiles_stat) and (files_num != 0):
-        for filename in word_in_afile_stat.keys():
+        for filename, value in word_in_afile_stat.items():
             TF_IDF_result = {}
             TF_IDF_result.clear()
-            for word in word_in_afile_stat[filename].keys():
+            for word in value.keys():
                 word_n = word_in_afile_stat[filename][word][0]
                 word_sum = word_in_afile_stat[filename][word][1]
                 with_word_sum = word_in_allfiles_stat[word]
```

Comment on lines -11 to +43 (Author): Function `TF_IDF_Compute` refactored.
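Two of the patterns applied to `TF_IDF_Compute`, sketched in isolation. The file name, loop, and counters below are illustrative stand-ins, not the project's code:

```python
seen_in_file = {}  # words already counted for the current file
doc_freq = {}      # number of files each word appears in

# The context manager closes the handle even if the body raises,
# replacing the manual open()/close() pair.
with open('words.txt', 'r') as f:  # 'words.txt' is a placeholder name
    for line in f:
        for word in line.strip().split(','):
            if word not in doc_freq:
                doc_freq[word] = 1
                seen_in_file[word] = 1
            elif word not in seen_in_file:
                # previously "else:" followed by a nested "if", merged into one elif
                doc_freq[word] += 1
                seen_in_file[word] = 1
```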
```diff
@@ -56,19 +51,17 @@ def TF_IDF_Compute(file_import_url_temp):
             # line = f1.readline()
             TF_IDF_last_result.append(filename)
-            TF_IDF_last_result.extend(result_temp[0:10])
+            TF_IDF_last_result.extend(result_temp[:10])

             # TF_IDF_last_result.append(line)
             TF_IDF_last_result.append('\n')

-    f = open("results.txt", "a+")
-
-    for s in TF_IDF_last_result:
-        # print s
-        for i in s:
-            f.write(str(i))
-    f.write("\n")
-    f.close()
+    with open("results.txt", "a+") as f:
+        for s in TF_IDF_last_result:
+            # print s
+            for i in s:
+                f.write(str(i))
+        f.write("\n")


 if __name__ == '__main__':
```
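The other changes in this file: iterating `.items()` instead of `.keys()` avoids re-indexing the dict on every pass, and the `0` start index in `result_temp[0:10]` is redundant. Note also that the retained `dict.has_key()` calls exist only in Python 2, while the f-strings introduced elsewhere in this PR require Python 3.6+; under Python 3 the membership operator replaces `has_key`. A sketch with made-up data:

```python
stats = {'a.txt': {'alpha': 3}, 'b.txt': {'beta': 5}}  # made-up data

# Before: for filename in stats.keys(): value = stats[filename]
# After: .items() yields each key/value pair without a second lookup
for filename, value in stats.items():
    print(filename, list(value))

scores = list(range(20))
assert scores[0:10] == scores[:10]  # the explicit 0 start index changes nothing

# Python 2's stats.has_key('a.txt') becomes a membership test in Python 3:
assert 'a.txt' in stats
```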
File 3:

```diff
@@ -14,10 +14,10 @@
 # path='/home/mbtrec/mhwang/pro/computer/'
 base_path = GrobalParament.path
-path = base_path + 'data/computer/'  # raw data
-path1 = base_path + 'data/title_and_abs/'  # processed titles and abstracts
-newpath = base_path + 'data/pro_keyword/'
-newpath2 = base_path + 'data/keyword/'
+path = f'{base_path}data/computer/'
+path1 = f'{base_path}data/title_and_abs/'
+newpath = f'{base_path}data/pro_keyword/'
+newpath2 = f'{base_path}data/keyword/'

 filelist = os.listdir(path)  # get all files under the current path
```

Comment on lines -17 to +20 (Author): Lines 17-20 refactored (`use-fstring-for-concatenation`). This removes the following comments (why?): `# raw data` and `# processed titles and abstracts`.
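The f-string rewrite concatenates the segments verbatim, so it still depends on `base_path` ending in a slash. Not part of this PR, but worth noting: `os.path.join` builds the same kind of path without that assumption. A sketch with a hypothetical base:

```python
import os

base_path = '/home/user/project'  # hypothetical value, no trailing slash

# join inserts exactly one separator between segments
keyword_dir = os.path.join(base_path, 'data', 'keyword')
print(keyword_dir)  # /home/user/project/data/keyword (on POSIX)
```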
```diff
@@ -29,29 +29,25 @@ def get_text():
         filename = os.path.splitext(files)[0]  # take the file name
         soup = BeautifulSoup(open(path + filename + '.xml'), 'html.parser')  # parse the page
         b = soup.find("p", class_="abstracts")  # grab the content of the <p class="abstracts"> tag
-        # print b
         if b is None or b.string is None:
             continue
-        else:
-            abstracts.extend(soup.title.stripped_strings)
-            s = b.string
-            abstracts.extend(s.encode('utf-8'))
-            f = open(path1 + filename + ".txt", "w+")  # write out a txt file
+        abstracts.extend(soup.title.stripped_strings)
+        s = b.string
+        abstracts.extend(s.encode('utf-8'))
+        with open(path1 + filename + ".txt", "w+") as f:
             for i in abstracts:
                 f.write(i)
-            f.close()
-            abstracts = []
+        abstracts = []

         # getPro_keyword: clean the text out of the <dl> tags in the xml file
         links = soup.find_all("dl")
         # print links
         for link in links:
             s1 = link.get_text()
             # print s1
-            f = open(newpath + filename + ".txt", "w+")  # drop the raw, unprocessed text into the pro_keyword folder
-            for i in s1:
-                f.write(i)
-            f.close()
+            with open(newpath + filename + ".txt", "w+") as f:
+                for i in s1:
+                    f.write(i)


 # Further process the files in the getPro_keyword folder from the previous step to get each file's keywords
```

Comment on lines -32 to +40 (Author): Function `get_text` refactored. This removes the following comments (why?): `# print b`.
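The `get_text` change combines two refactorings: since `continue` already ends the iteration, the `else:` block can be dedented into the main body, and each output file is opened with `with` so it is closed before `abstracts` is reset. A standalone sketch with stand-in data and hypothetical file names:

```python
pages = {1: None, 2: 'first abstract', 3: None, 4: 'second abstract'}  # stand-ins

for n, text in pages.items():
    if text is None:
        continue  # guard clause: no else branch is needed after continue
    with open(f'abstract_{n}.txt', 'w+') as f:  # hypothetical output name
        f.write(text)
    # the file is guaranteed closed here, before the next iteration
```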
```diff
@@ -62,18 +58,17 @@ def get_keyword():
         filename = os.path.splitext(files)[0]
         begin = 100000
         end = 10000
-        f1 = open(newpath + filename + ".txt", "r")
-        f2 = open(newpath2 + filename + '.txt', "w+")
-        for (num, value) in enumerate(f1):
-            if value.count("关键词") > 0:  # find the line number of the keywords line
-                begin = num
-            if value.count("基金项目") > 0 or value.count("机标分类号") > 0 or value.count("机标关键词") > 0 or value.count(
-                    "基金项目") > 0 or value.count("DOI") > 0:
-                end = num
-            if num > begin and num < end and value[:-1].strip():
-                f2.write(value.strip())
-                f2.write(" ")
-        f1.close()
+        with open(newpath + filename + ".txt", "r") as f1:
+            f2 = open(newpath2 + filename + '.txt', "w+")
+            for (num, value) in enumerate(f1):
+                if value.count("关键词") > 0:  # find the line number of the keywords line
+                    begin = num
+                if value.count("基金项目") > 0 or value.count("机标分类号") > 0 or value.count("机标关键词") > 0 or value.count(
+                        "基金项目") > 0 or value.count("DOI") > 0:
+                    end = num
+                if num > begin and num < end and value[:-1].strip():
+                    f2.write(value.strip())
+                    f2.write(" ")
         f2.close()
```

The string literals `"关键词"` (keywords), `"基金项目"` (funding project), `"机标分类号"` (machine-assigned classification number), and `"机标关键词"` (machine-assigned keywords) are section markers matched against the Chinese source files, so they stay untranslated; note the condition tests `"基金项目"` twice.

Comment on lines -65 to +71 (Author): Function `get_keyword` refactored.
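One loose end the refactor leaves: `f1` is now managed by `with`, but `f2` is still opened and closed by hand. A single `with` statement can manage both handles; a sketch with placeholder file names:

```python
# 'keywords_raw.txt' and 'keywords.txt' are placeholder names
with open('keywords_raw.txt', 'r') as f1, open('keywords.txt', 'w+') as f2:
    for num, value in enumerate(f1):
        if value.strip():
            f2.write(value.strip())
            f2.write(' ')
# both handles are closed here, even if the loop body raises
```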
Comment (Author): Function `com_tf` refactored with the following changes: `use-fstring-for-concatenation`. This removes the following comments (why?):