Commit be44ad1
committed: adding nlp algorithm
1 parent: a6a75d2

12 files changed: 1381 additions, 0 deletions
Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
from typing import Dict
from collections import Counter
import re


class WordFrequencyCounter:

    def count_word_frequency_counter(self, text: str) -> Dict[str, int]:
        """
        Count the frequency of each word in a text document.

        Args:
            text: Input text document

        Returns:
            Dictionary with word frequencies
        """
        # Extract word tokens using word boundaries
        words = re.findall(r'\b\w+\b', text)
        word_freq = Counter(words)
        return word_freq

    def count_word_frequency_counter_with_stop_words(self, text: str) -> Dict[str, int]:
        """
        Count the frequency of each word by splitting on commas, periods,
        and whitespace; stop words are kept in the counts.

        Args:
            text: Input text document

        Returns:
            Dictionary with word frequencies
        """
        # Split on punctuation/whitespace and drop the empty tokens that
        # consecutive delimiters would otherwise produce
        words = [w for w in re.split(r'[,.]|\s+', text) if w]
        word_freq = Counter(words)
        return word_freq


# Example usage
if __name__ == "__main__":
    counter = WordFrequencyCounter()

    text = """
    Natural language processing (NLP) is a subfield of linguistics,
    computer science, and artificial intelligence concerned with the
    interactions between computers and human language.
    """

    print(counter.count_word_frequency_counter(text=text))
    print(counter.count_word_frequency_counter_with_stop_words(text=text))
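Note that re.findall here preserves case, so 'The' and 'the' would be counted as distinct keys. If case-insensitive counts are wanted, a minimal sketch of an extra method (hypothetical, not part of this commit):

# Hypothetical case-insensitive variant for WordFrequencyCounter
def count_word_frequency_lowercase(self, text: str) -> Dict[str, int]:
    # Lowercase first so 'The' and 'the' fold into one key
    words = re.findall(r'\b\w+\b', text.lower())
    return Counter(words)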
Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
from typing import List
import re


class TextCleanerTokenizer:

    def clean_and_tokenize_advanced(self, text: str, remove_numbers: bool = False) -> List[str]:
        """
        Advanced cleaning and tokenization with additional options.

        Args:
            text: Input text to clean and tokenize
            remove_numbers: If True, remove numeric tokens

        Returns:
            List of cleaned tokens
        """
        # Convert to lowercase
        text = text.lower()

        # Remove punctuation
        text = re.sub(pattern=r'[^\w\s]', repl='', string=text)

        # Remove extra whitespace
        text = re.sub(pattern=r'\s+', repl=' ', string=text).strip()

        # Tokenize
        tokens = text.split()

        # Optionally remove numbers
        if remove_numbers:
            tokens = [token for token in tokens if not token.isdigit()]

        return tokens

    def clean_with_regex_tokenize(self, text: str) -> List[str]:
        """
        Clean and tokenize using regex word boundaries.

        Args:
            text: Input text to clean and tokenize

        Returns:
            List of cleaned tokens
        """
        # Convert to lowercase
        text = text.lower()

        # Extract word tokens using regex word boundaries
        tokens = re.findall(pattern=r'\b\w+\b', string=text)

        return tokens


# Example usage
if __name__ == "__main__":
    cleaner = TextCleanerTokenizer()

    text = """
    Hello, World! This is a TEST sentence with punctuation...
    Numbers like 123 and symbols @#$ should be handled properly.
    Multiple spaces should be normalized.
    """

    print(cleaner.clean_and_tokenize_advanced(text=text, remove_numbers=True))
    print(cleaner.clean_with_regex_tokenize(text=text))
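One caveat: \b\w+\b matches digits and underscores too, so clean_with_regex_tokenize keeps tokens like '123' even though the docstring promises cleaning. If numeric filtering is wanted on the regex path as well, a sketch of a hypothetical extra method for TextCleanerTokenizer (not part of this commit) could mirror the remove_numbers option:

# Hypothetical extra method: regex tokenization with numeric filtering
def clean_with_regex_tokenize_no_numbers(self, text: str) -> List[str]:
    tokens = re.findall(pattern=r'\b\w+\b', string=text.lower())
    # Drop purely numeric tokens, matching clean_and_tokenize_advanced
    return [t for t in tokens if not t.isdigit()]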
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
from typing import Optional, Set
import re


class StopwordRemoval:

    # Common English stopwords
    STOPWORDS = {
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
        'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
        'to', 'was', 'will', 'with', 'this', 'but', 'they', 'have',
        'had', 'what', 'when', 'where', 'who', 'which', 'why', 'how'
    }

    def __init__(self, custom_stopwords: Optional[Set[str]] = None):
        """
        Initialize with optional custom stopwords.

        Args:
            custom_stopwords: Optional set of additional stopwords
        """
        self.stopwords = self.STOPWORDS.copy()
        if custom_stopwords:
            self.stopwords.update(custom_stopwords)

    def remove_stopwords(self, text: str) -> str:
        """
        Remove stopwords from text and return the cleaned string.

        Args:
            text: Input text

        Returns:
            Text with stopwords removed
        """
        # Convert to lowercase and tokenize
        words = re.findall(r'\b\w+\b', text.lower())

        # Filter out stopwords
        filtered_words = [word for word in words if word not in self.stopwords]

        return ' '.join(filtered_words)


# Example usage
if __name__ == "__main__":
    remover = StopwordRemoval()

    text = """
    The quick brown fox jumps over the lazy dog.
    This is a sample sentence with many common stopwords that
    will be removed from the text.
    """

    print(remover.remove_stopwords(text=text))
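The custom_stopwords parameter accepted by __init__ is never exercised in the example above; a short usage sketch (the extra stopwords are chosen for illustration):

# Extend the built-in list with caller-supplied stopwords
remover = StopwordRemoval(custom_stopwords={'fox', 'dog'})
print(remover.remove_stopwords("The quick brown fox jumps over the lazy dog."))
# -> quick brown jumps over lazy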
Lines changed: 158 additions & 0 deletions
@@ -0,0 +1,158 @@
from typing import Dict, List


# ============================================================================
# METHOD 1: Using Transformers (BERT, RoBERTa, DistilBERT)
# ============================================================================
class TransformerSentimentAnalyzer:
    """
    Sentiment analysis using pre-trained transformer models.
    Install: pip install transformers torch
    """

    def __init__(self, model_name: str = "distilbert-base-uncased-finetuned-sst-2-english"):
        """
        Initialize with a pre-trained model.

        Popular models:
        - distilbert-base-uncased-finetuned-sst-2-english (fast, lightweight)
        - cardiffnlp/twitter-roberta-base-sentiment (social media)
        - nlptown/bert-base-multilingual-uncased-sentiment (multilingual)
        """
        from transformers import pipeline
        self.classifier = pipeline('sentiment-analysis', model=model_name)

    def analyze(self, text: str) -> Dict:
        """Analyze sentiment using the transformer model."""
        result = self.classifier(text)[0]
        return {
            'sentiment': result['label'].lower(),
            'confidence': round(result['score'], 4)
        }

    def batch_analyze(self, texts: List[str]) -> List[Dict]:
        """Analyze multiple texts efficiently."""
        # Pass the full list so the pipeline can batch the forward pass
        results = self.classifier(texts)

        return [
            {
                'sentiment': r['label'].lower(),
                'confidence': round(r['score'], 4)
            }
            for r in results
        ]
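Since transformers and torch are heavy optional dependencies (the class docstring points at pip install transformers torch), a guarded import can fail with an actionable message instead of a bare ImportError at construction time; a minimal sketch (hypothetical helper, not part of this commit):

# Hypothetical helper: check the optional dependency up front
def require_transformers() -> None:
    try:
        import transformers  # noqa: F401
    except ImportError as exc:
        raise ImportError(
            "TransformerSentimentAnalyzer requires 'transformers' and 'torch'; "
            "install them with: pip install transformers torch"
        ) from exc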
# ============================================================================
# TESTING AND EXAMPLES
# ============================================================================
if __name__ == "__main__":

    print("="*80)
    print("TRANSFORMER SENTIMENT ANALYSIS - TESTING")
    print("="*80 + "\n")

    # Initialize the analyzer
    print("Loading model... (this may take a moment on first run)")
    analyzer = TransformerSentimentAnalyzer()
    print("Model loaded successfully!\n")

    # ========================================================================
    # TEST 1: Single Text Analysis
    # ========================================================================
    print("TEST 1: Single Text Analysis")
    print("-"*80)

    single_text = "This movie is absolutely amazing! I loved every minute of it."
    result = analyzer.analyze(single_text)

    print(f"Text: {single_text}")
    print(f"Sentiment: {result['sentiment'].upper()}")
    print(f"Confidence: {result['confidence']*100:.2f}%\n")

    # ========================================================================
    # TEST 2: Multiple Different Sentiments
    # ========================================================================
    print("\nTEST 2: Multiple Different Sentiments")
    print("-"*80)

    test_cases = [
        "I absolutely love this product! It's the best thing ever!",
        "This is terrible. Worst experience of my life.",
        "It's okay, nothing special but not bad either.",
        "The customer service was outstanding and very helpful.",
        "I'm so disappointed and frustrated with this purchase.",
    ]

    for i, text in enumerate(test_cases, 1):
        result = analyzer.analyze(text)
        print(f"\n{i}. Text: {text}")
        print(f"   Sentiment: {result['sentiment'].upper()}")
        print(f"   Confidence: {result['confidence']*100:.2f}%")

    # ========================================================================
    # TEST 3: Batch Analysis (More Efficient)
    # ========================================================================
    print("\n\nTEST 3: Batch Analysis")
    print("-"*80)

    batch_texts = [
        "Great service!",
        "Horrible experience.",
        "Not impressed at all.",
        "Fantastic product, highly recommend!",
        "Could be better, but acceptable."
    ]

    batch_results = analyzer.batch_analyze(batch_texts)

    for text, result in zip(batch_texts, batch_results):
        print(f"\nText: {text}")
        print(f"Sentiment: {result['sentiment'].upper()} ({result['confidence']*100:.2f}%)")

    # ========================================================================
    # TEST 4: Edge Cases
    # ========================================================================
    print("\n\nTEST 4: Edge Cases")
    print("-"*80)

    edge_cases = [
        "😊❤️",              # Emojis
        "Not bad at all!",   # Negation
        "I don't hate it.",  # Double negation
        "",                  # Empty (skipped below)
        "Meh.",              # Ambiguous
    ]

    for text in edge_cases:
        if text:  # Skip empty strings
            result = analyzer.analyze(text)
            print(f"\nText: '{text}'")
            print(f"Sentiment: {result['sentiment'].upper()}")
            print(f"Confidence: {result['confidence']*100:.2f}%")

    # ========================================================================
    # TEST 5: Summary Statistics
    # ========================================================================
    print("\n\n" + "="*80)
    print("SUMMARY STATISTICS")
    print("="*80)

    all_texts = test_cases + batch_texts
    all_results = analyzer.batch_analyze(all_texts)

    positive_count = sum(1 for r in all_results if r['sentiment'] == 'positive')
    negative_count = sum(1 for r in all_results if r['sentiment'] == 'negative')
    avg_confidence = sum(r['confidence'] for r in all_results) / len(all_results)

    print(f"\nTotal texts analyzed: {len(all_texts)}")
    print(f"Positive sentiments: {positive_count}")
    print(f"Negative sentiments: {negative_count}")
    print(f"Average confidence: {avg_confidence*100:.2f}%")

    print("\n" + "="*80)
    print("Testing completed!")
    print("="*80)
