Commit be44ad1
committed: adding nlp algorithm
1 parent: a6a75d2

12 files changed: 1381 additions, 0 deletions
Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
from typing import Dict
from collections import Counter
import re


class WordFrequencyCounter:

    def count_word_frequency_counter(self, text: str) -> Dict[str, int]:
        """
        Count the frequency of each word in a text document.

        Args:
            text: Input text document

        Returns:
            Dictionary with word frequencies
        """
        # Extract word tokens using word boundaries
        words = re.findall(r'\b\w+\b', text)
        word_freq = Counter(words)
        return word_freq

    def count_word_frequency_counter_with_stop_words(self, text: str) -> Dict[str, int]:
        """
        Count the frequency of each word by splitting on commas, periods,
        and whitespace; stop words are kept in the counts.

        Args:
            text: Input text document

        Returns:
            Dictionary with word frequencies
        """
        # Split on punctuation/whitespace and drop the empty tokens that
        # consecutive delimiters would otherwise produce
        words = [w for w in re.split(r'[,.]|\s+', text) if w]
        word_freq = Counter(words)
        return word_freq


# Example usage
if __name__ == "__main__":
    counter = WordFrequencyCounter()

    text = """
    Natural language processing (NLP) is a subfield of linguistics,
    computer science, and artificial intelligence concerned with the
    interactions between computers and human language.
    """

    print(counter.count_word_frequency_counter(text=text))
    print(counter.count_word_frequency_counter_with_stop_words(text=text))
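Note that re.findall here preserves case, so 'The' and 'the' would be counted as distinct keys. If case-insensitive counts are wanted, a minimal sketch of an extra method (hypothetical, not part of this commit):

# Hypothetical case-insensitive variant for WordFrequencyCounter
def count_word_frequency_lowercase(self, text: str) -> Dict[str, int]:
    # Lowercase first so 'The' and 'the' fold into one key
    words = re.findall(r'\b\w+\b', text.lower())
    return Counter(words)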
Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
from typing import List
import re


class TextCleanerTokenizer:

    def clean_and_tokenize_advanced(self, text: str, remove_numbers: bool = False) -> List[str]:
        """
        Advanced cleaning and tokenization with additional options.

        Args:
            text: Input text to clean and tokenize
            remove_numbers: If True, remove numeric tokens

        Returns:
            List of cleaned tokens
        """
        # Convert to lowercase
        text = text.lower()

        # Remove punctuation
        text = re.sub(pattern=r'[^\w\s]', repl='', string=text)

        # Remove extra whitespace
        text = re.sub(pattern=r'\s+', repl=' ', string=text).strip()

        # Tokenize
        tokens = text.split()

        # Optionally remove numbers
        if remove_numbers:
            tokens = [token for token in tokens if not token.isdigit()]

        return tokens

    def clean_with_regex_tokenize(self, text: str) -> List[str]:
        """
        Clean and tokenize using regex word boundaries.

        Args:
            text: Input text to clean and tokenize

        Returns:
            List of cleaned tokens
        """
        # Convert to lowercase
        text = text.lower()

        # Extract word tokens using regex word boundaries
        tokens = re.findall(pattern=r'\b\w+\b', string=text)

        return tokens


# Example usage
if __name__ == "__main__":
    cleaner = TextCleanerTokenizer()

    text = """
    Hello, World! This is a TEST sentence with punctuation...
    Numbers like 123 and symbols @#$ should be handled properly.
    Multiple spaces should be normalized.
    """

    print(cleaner.clean_and_tokenize_advanced(text=text, remove_numbers=True))
    print(cleaner.clean_with_regex_tokenize(text=text))
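One caveat: \b\w+\b matches digits and underscores too, so clean_with_regex_tokenize keeps tokens like '123' even though the docstring promises cleaning. If numeric filtering is wanted on the regex path as well, a sketch of a hypothetical extra method for TextCleanerTokenizer (not part of this commit) could mirror the remove_numbers option:

# Hypothetical extra method: regex tokenization with numeric filtering
def clean_with_regex_tokenize_no_numbers(self, text: str) -> List[str]:
    tokens = re.findall(pattern=r'\b\w+\b', string=text.lower())
    # Drop purely numeric tokens, matching clean_and_tokenize_advanced
    return [t for t in tokens if not t.isdigit()]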
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
from typing import Optional, Set
import re


class StopwordRemoval:

    # Common English stopwords
    STOPWORDS = {
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
        'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
        'to', 'was', 'will', 'with', 'this', 'but', 'they', 'have',
        'had', 'what', 'when', 'where', 'who', 'which', 'why', 'how'
    }

    def __init__(self, custom_stopwords: Optional[Set[str]] = None):
        """
        Initialize with optional custom stopwords.

        Args:
            custom_stopwords: Optional set of additional stopwords
        """
        self.stopwords = self.STOPWORDS.copy()
        if custom_stopwords:
            self.stopwords.update(custom_stopwords)

    def remove_stopwords(self, text: str) -> str:
        """
        Remove stopwords from text and return the cleaned string.

        Args:
            text: Input text

        Returns:
            Text with stopwords removed
        """
        # Convert to lowercase and tokenize
        words = re.findall(r'\b\w+\b', text.lower())

        # Filter out stopwords
        filtered_words = [word for word in words if word not in self.stopwords]

        return ' '.join(filtered_words)


# Example usage
if __name__ == "__main__":
    remover = StopwordRemoval()

    text = """
    The quick brown fox jumps over the lazy dog.
    This is a sample sentence with many common stopwords that
    will be removed from the text.
    """

    print(remover.remove_stopwords(text=text))
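The custom_stopwords parameter accepted by __init__ is never exercised in the example above; a short usage sketch (the extra stopwords are chosen for illustration):

# Extend the built-in list with caller-supplied stopwords
remover = StopwordRemoval(custom_stopwords={'fox', 'dog'})
print(remover.remove_stopwords("The quick brown fox jumps over the lazy dog."))
# -> quick brown jumps over lazy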
Lines changed: 158 additions & 0 deletions
@@ -0,0 +1,158 @@
from typing import Dict, List


# ============================================================================
# METHOD 1: Using Transformers (BERT, RoBERTa, DistilBERT)
# ============================================================================
class TransformerSentimentAnalyzer:
    """
    Sentiment analysis using pre-trained transformer models.
    Install: pip install transformers torch
    """

    def __init__(self, model_name: str = "distilbert-base-uncased-finetuned-sst-2-english"):
        """
        Initialize with a pre-trained model.

        Popular models:
        - distilbert-base-uncased-finetuned-sst-2-english (fast, lightweight)
        - cardiffnlp/twitter-roberta-base-sentiment (social media)
        - nlptown/bert-base-multilingual-uncased-sentiment (multilingual)
        """
        from transformers import pipeline
        self.classifier = pipeline('sentiment-analysis', model=model_name)

    def analyze(self, text: str) -> Dict:
        """Analyze sentiment using the transformer model."""
        result = self.classifier(text)[0]
        return {
            'sentiment': result['label'].lower(),
            'confidence': round(result['score'], 4)
        }

    def batch_analyze(self, texts: List[str]) -> List[Dict]:
        """Analyze multiple texts efficiently."""
        # Pass the full list so the pipeline can batch the forward pass
        results = self.classifier(texts)

        return [
            {
                'sentiment': r['label'].lower(),
                'confidence': round(r['score'], 4)
            }
            for r in results
        ]
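Since transformers and torch are heavy optional dependencies (the class docstring points at pip install transformers torch), a guarded import can fail with an actionable message instead of a bare ImportError at construction time; a minimal sketch (hypothetical helper, not part of this commit):

# Hypothetical helper: check the optional dependency up front
def require_transformers() -> None:
    try:
        import transformers  # noqa: F401
    except ImportError as exc:
        raise ImportError(
            "TransformerSentimentAnalyzer requires 'transformers' and 'torch'; "
            "install them with: pip install transformers torch"
        ) from exc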
# ============================================================================
# TESTING AND EXAMPLES
# ============================================================================
if __name__ == "__main__":

    print("="*80)
    print("TRANSFORMER SENTIMENT ANALYSIS - TESTING")
    print("="*80 + "\n")

    # Initialize the analyzer
    print("Loading model... (this may take a moment on first run)")
    analyzer = TransformerSentimentAnalyzer()
    print("Model loaded successfully!\n")

    # ========================================================================
    # TEST 1: Single Text Analysis
    # ========================================================================
    print("TEST 1: Single Text Analysis")
    print("-"*80)

    single_text = "This movie is absolutely amazing! I loved every minute of it."
    result = analyzer.analyze(single_text)

    print(f"Text: {single_text}")
    print(f"Sentiment: {result['sentiment'].upper()}")
    print(f"Confidence: {result['confidence']*100:.2f}%\n")

    # ========================================================================
    # TEST 2: Multiple Different Sentiments
    # ========================================================================
    print("\nTEST 2: Multiple Different Sentiments")
    print("-"*80)

    test_cases = [
        "I absolutely love this product! It's the best thing ever!",
        "This is terrible. Worst experience of my life.",
        "It's okay, nothing special but not bad either.",
        "The customer service was outstanding and very helpful.",
        "I'm so disappointed and frustrated with this purchase.",
    ]

    for i, text in enumerate(test_cases, 1):
        result = analyzer.analyze(text)
        print(f"\n{i}. Text: {text}")
        print(f"   Sentiment: {result['sentiment'].upper()}")
        print(f"   Confidence: {result['confidence']*100:.2f}%")

    # ========================================================================
    # TEST 3: Batch Analysis (More Efficient)
    # ========================================================================
    print("\n\nTEST 3: Batch Analysis")
    print("-"*80)

    batch_texts = [
        "Great service!",
        "Horrible experience.",
        "Not impressed at all.",
        "Fantastic product, highly recommend!",
        "Could be better, but acceptable."
    ]

    batch_results = analyzer.batch_analyze(batch_texts)

    for text, result in zip(batch_texts, batch_results):
        print(f"\nText: {text}")
        print(f"Sentiment: {result['sentiment'].upper()} ({result['confidence']*100:.2f}%)")

    # ========================================================================
    # TEST 4: Edge Cases
    # ========================================================================
    print("\n\nTEST 4: Edge Cases")
    print("-"*80)

    edge_cases = [
        "😊❤️",              # Emojis
        "Not bad at all!",   # Negation
        "I don't hate it.",  # Double negation
        "",                  # Empty (skipped below)
        "Meh.",              # Ambiguous
    ]

    for text in edge_cases:
        if text:  # Skip empty strings
            result = analyzer.analyze(text)
            print(f"\nText: '{text}'")
            print(f"Sentiment: {result['sentiment'].upper()}")
            print(f"Confidence: {result['confidence']*100:.2f}%")

    # ========================================================================
    # TEST 5: Summary Statistics
    # ========================================================================
    print("\n\n" + "="*80)
    print("SUMMARY STATISTICS")
    print("="*80)

    all_texts = test_cases + batch_texts
    all_results = analyzer.batch_analyze(all_texts)

    positive_count = sum(1 for r in all_results if r['sentiment'] == 'positive')
    negative_count = sum(1 for r in all_results if r['sentiment'] == 'negative')
    avg_confidence = sum(r['confidence'] for r in all_results) / len(all_results)

    print(f"\nTotal texts analyzed: {len(all_texts)}")
    print(f"Positive sentiments: {positive_count}")
    print(f"Negative sentiments: {negative_count}")
    print(f"Average confidence: {avg_confidence*100:.2f}%")

    print("\n" + "="*80)
    print("Testing completed!")
    print("="*80)
