-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
102 lines (80 loc) · 3.33 KB
/
app.py
File metadata and controls
102 lines (80 loc) · 3.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# -*- coding: utf-8 -*-
import pandas as pd
from flask import Flask, jsonify, request
import streamlit as st
import pandas as pd
import numpy as np
import re, string
import pickle
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import sys
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Unpickle the trained artefacts produced offline. Usage in predict() shows
# the dict holds: 'vec' (fitted vectorizer), 'r' (per-category weight
# vectors) and 'm' (per-category classifiers).
# NOTE(review): pickle.load on an untrusted file executes arbitrary code —
# ensure model1.pkl comes from a trusted source.
# Fix: the original leaked the file handle (open() with no close); a `with`
# block guarantees it is released.
with open('model1.pkl', 'rb') as _model_file:
    model = pickle.load(_model_file)
def preprocess(sentence):
    """Clean and stem a raw comment so it matches the model's training text.

    Pipeline (order matters): strip HTML tags -> strip/space punctuation ->
    keep alphabetic characters only -> remove English stop words ->
    Snowball-stem every remaining word.

    Parameters:
        sentence: raw comment text (coerced with str()).

    Returns:
        The cleaned, stemmed, space-separated string.
    """
    if not sys.warnoptions:
        warnings.simplefilter("ignore")

    # Build the call-invariant resources once and cache them on the function
    # object: the original rebuilt the stop-word set, recompiled the stop-word
    # regex and re-created the stemmer on EVERY call, which dominated the
    # per-comment latency. Caching changes no output.
    if not hasattr(preprocess, "_stop_re"):
        stop_words = set(stopwords.words('english'))
        stop_words.update(['zero', 'one', 'two', 'three', 'four', 'five',
                           'six', 'seven', 'eight', 'nine', 'ten', 'may',
                           'also', 'across', 'among', 'beside', 'however',
                           'yet', 'within'])
        # \b(word)\W matches a whole stop word followed by one non-word
        # character (both are replaced by a single space). NOTE(review): a
        # stop word at the very end of the string has no trailing \W and is
        # therefore kept — preserved as-is since the model was presumably
        # trained on text cleaned with this exact behaviour.
        # re.escape guards against any regex metacharacter in a stop word.
        preprocess._stop_re = re.compile(
            r"\b(" + "|".join(re.escape(w) for w in stop_words) + r")\W",
            re.I)
        preprocess._stemmer = SnowballStemmer('english')

    # 1. Replace anything that looks like an HTML tag with a space.
    sentence = re.sub(r'<.*?>', ' ', str(sentence))
    # 2. Delete quote-like punctuation, turn separators into spaces.
    sentence = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    sentence = re.sub(r'[.|,|)|(|\|/]', r' ', sentence)
    sentence = sentence.strip()
    sentence = sentence.replace("\n", " ")
    # 3. Keep alphabetic characters only (runs of non-letters -> one space).
    kept = []
    for word in sentence.split():
        kept.append(re.sub('[^a-z A-Z]+', ' ', word))
    sentence = ' '.join(kept).strip()
    # 4. Remove stop words (case-insensitive), replacing each with a space.
    sentence = preprocess._stop_re.sub(" ", sentence)
    # 5. Stem each remaining word.
    stem = preprocess._stemmer.stem
    return ' '.join(stem(word) for word in sentence.split())
def predict(comment):
    """Score a comment against every toxicity category.

    For each category: vectorise the preprocessed comment with the stored
    vectorizer, scale the features element-wise by that category's weight
    vector ``model['r']`` (presumably a naive-Bayes log-count ratio — TODO
    confirm against the training code), then take the positive-class
    probability from that category's classifier ``model['m']``.

    Parameters:
        comment: raw comment text.

    Returns:
        numpy array of the category names whose probability exceeds 0.5
        (empty when no category fires).
    """
    categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
                  'identity_hate']
    # Derive the width from the list instead of the original hard-coded 6,
    # so adding/removing a category cannot silently desynchronise the arrays.
    label_array = np.array(categories).reshape(1, len(categories))
    probabilities = np.zeros((1, len(categories)))
    cleaned = preprocess(comment)
    features = model['vec'].transform([cleaned])
    for i, category in enumerate(categories):
        weighted = features.multiply(model['r'][category])
        probabilities[:, i] = model['m'][category].predict_proba(weighted)[:, 1]
    # Boolean mask at the 0.5 decision threshold selects triggered labels.
    return label_array[probabilities > 0.5]
def main():
    """Streamlit page: collect a comment, classify it, display the labels."""
    st.title('Toxic Comment Classifier')
    typed_comment = st.text_area("Enter the Comment to Classify")
    result = ''
    if st.button('Predict'):
        result = predict(typed_comment)
        # Single success banner; the label part is the joined category names,
        # or the fixed clean-comment phrase when nothing fired.
        if len(result) > 0:
            labels = ', '.join(result)
        else:
            labels = "No toxic elements"
        st.success("Given Comment :" + typed_comment + " contains " + labels)
# Entry point when the script is executed directly (e.g. via streamlit run).
if __name__=='__main__':
    main()