-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathlanguage_detection.py
More file actions
108 lines (84 loc) · 3.28 KB
/
language_detection.py
File metadata and controls
108 lines (84 loc) · 3.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from collections import defaultdict
from multiprocessing import Pool
import re
from langdetect import detect
import unicodedata
from nltk.tokenize import RegexpTokenizer
from gensim.parsing.preprocessing import strip_punctuation
import pandas as pd
import db_utilities
from topic_modeling_LDA import split_list
from tqdm import tqdm
# Matches runs of emoji and emoji-related code points. Compiled once at import
# time so repeated calls do not have to look the pattern up again.
#
# NOTE: an earlier version of this class contained the range U+24C2-U+1F251,
# which spans most of the Basic Multilingual Plane and therefore also deleted
# CJK ideographs, kana, Hangul, etc. Only the individual emoji code points
# from that span are listed here so that text in those scripts is preserved.
_EMOJI_PATTERN = re.compile(
    "["
    "\U00010000-\U0010FFFF"  # all supplementary planes: emoticons, pictographs, transport, flags, ...
    "\u2600-\u2B55"  # miscellaneous symbols, dingbats, arrows, stars, ...
    "\u25AA-\u25FE"  # geometric shapes that have emoji presentation
    "\u24C2"  # circled latin capital letter M
    "\u231A"  # watch
    "\u23CF"  # eject symbol
    "\u23E9"  # fast-forward
    "\u200D"  # zero width joiner (glues emoji sequences together)
    "\uFE0F"  # variation selector-16 (emoji presentation)
    "\u3030"  # wavy dash
    "\u303D"  # part alternation mark
    "\u3297"  # circled ideograph congratulation
    "\u3299"  # circled ideograph secret
    "]+",
    flags=re.UNICODE,
)


def deEmojify(text):
    """Return *text* with emoji and emoji-joining code points removed.

    Every maximal run of characters matched by ``_EMOJI_PATTERN`` is replaced
    by the empty string; all other characters (including CJK, kana and Hangul
    text) are returned unchanged.

    Args:
        text: The string to clean.

    Returns:
        A new string without the matched code points.
    """
    return _EMOJI_PATTERN.sub("", text)
# Clean raw messages so they can be handed to the language detector.
def preprocessDocs(docs):
    """Normalize every message in *docs* and return the cleaned strings.

    Each document goes through the same steps, in this order:

    1. emoji are removed with ``deEmojify`` and ``@name`` references
       (``@`` followed by ASCII letters/digits) are dropped;
    2. the text is lowercased and split into ``\\w+`` tokens;
    3. every token is NFKD-normalized and lowercased again (normalization
       can introduce new upper-case letters);
    4. purely numeric tokens are discarded;
    5. ``strip_punctuation`` is applied to what is left;
    6. the surviving tokens are joined with single spaces.

    Args:
        docs: Iterable of message strings.

    Returns:
        A list with one cleaned string per input document, in input order.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')

    def _clean(raw_doc):
        # Steps 1-2: strip emoji and mentions, then tokenize the lowercased text.
        without_mentions = re.sub(r'@[a-zA-Z0-9]+', '', deEmojify(raw_doc))
        words = word_tokenizer.tokenize(without_mentions.lower())

        kept = []
        for word in words:
            # Step 3: Unicode compatibility decomposition, then lowercase.
            normalized = unicodedata.normalize('NFKD', word).lower()
            # Step 4: drop tokens made only of numeric characters.
            if normalized.isnumeric():
                continue
            # Step 5: remove punctuation left inside the token.
            kept.append(strip_punctuation(normalized))

        # Step 6: back to a single string.
        return ' '.join(kept)

    return [_clean(doc) for doc in docs]
# Detect the dominant language used in a channel.
def detect_language(channel):
    """Return the majority language of a channel's text messages.

    The messages are cleaned with ``preprocessDocs``; every cleaned message
    of at least 15 characters is passed to ``langdetect.detect`` and the
    returned language codes are tallied. A language is reported only when it
    was detected for more than half of *all* the channel's messages
    (messages that were too short or failed detection still count towards
    the total).

    Args:
        channel: Mapping with a ``'text_messages'`` entry (a mapping whose
            values each expose a ``'message'`` string) and an ``'_id'``
            entry. NOTE(review): schema inferred from the key accesses
            below - confirm against ``db_utilities``.

    Returns:
        A ``(language, channel_id)`` tuple, where ``language`` is the code
        returned by ``detect`` or ``''`` when no language reaches the
        majority threshold, and ``channel_id`` is ``channel['_id']``.
    """
    raw_messages = channel['text_messages']
    messages = preprocessDocs(
        [entry['message'] for entry in raw_messages.values()]
    )

    votes = defaultdict(int)
    for message in messages:
        # Very short texts are skipped before calling the detector.
        if len(message) < 15:
            continue
        try:
            lan = detect(message)
        except Exception:
            # Best effort: a message the detector cannot handle is skipped.
            # `Exception` (rather than a bare `except`) lets KeyboardInterrupt
            # and SystemExit propagate so worker processes can be stopped.
            continue
        votes[lan] += 1

    # At most one language can exceed half of the messages, so the first
    # match is the only possible winner.
    majority = len(messages) / 2
    target_lan = next(
        (lan for lan, count in votes.items() if count > majority), ''
    )
    return target_lan, channel['_id']
def perform_language_detection(n_portion=100, n_pool=2):
    """Detect the language of every channel and write the mapping to CSV.

    Channel ids are fetched through ``db_utilities.get_channel_ids`` and
    divided with ``split_list``; each portion is loaded from the database
    and its channels are processed in parallel by ``detect_language``.
    The result is written to ``data/channel_to_language_mapping.csv`` with
    the columns ``ch_id`` and ``language``.

    Args:
        n_portion: Second argument forwarded to ``split_list``.
            NOTE(review): presumably the number of portions - confirm in
            ``topic_modeling_LDA.split_list``.
        n_pool: Number of worker processes in the pool.
    """
    channel_ids = db_utilities.get_channel_ids()
    portions = split_list(channel_ids, n_portion)

    results = {'ch_id': [], 'language': []}
    # One pool serves the whole run: starting and tearing down worker
    # processes for every portion is pure overhead.
    with Pool(n_pool) as pool:
        for portion in tqdm(portions):
            channels = db_utilities.get_channels_by_ids(portion)
            for language, ch_id in pool.map(detect_language, channels):
                results['language'].append(language)
                results['ch_id'].append(ch_id)

    pd.DataFrame(results).to_csv('data/channel_to_language_mapping.csv', index=False)
# Script entry point. The guard matters because the pipeline uses
# multiprocessing: a worker that imports this module must not start the
# whole run again.
if __name__ == "__main__":
    perform_language_detection()