Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
]
}
},
"initializeCommand": "powershell.exe .\\.devcontainer\\initialize.ps1",
"initializeCommand": "powershell.exe ./.devcontainer/initialize.ps1",
"postCreateCommand": "pip3 install -r ${containerWorkspaceFolder}/requirements-dev.txt",
"remoteUser": "vscode",
"mounts": [
Expand Down
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ Note: Pipeline has to be instantiated only once and can be reused.
## Existing preprocessors

### To Lower Case
Name: to_lower
Name: to_lower

Required additional data: -

Expand Down Expand Up @@ -91,6 +91,12 @@ Required additional data: CSV data in string form with the following line format

With this preprocessor you can replace specific words and abbreviations within the text with specified tokens. It is also possible to replace abbreviations ending with a dot. Other special characters are not supported, though.

### Remove signature
Name: remove_signature

Removes closing formulas such as "Best regards" or "Mit freundlichen Grüßen" together with everything that follows them, and also removes thank-you expressions such as "Thanks a lot".
Should be used before the other processing steps.

## How to start developing

### With VS Code
Expand Down
82 changes: 82 additions & 0 deletions ai_data_preprocessing_queue/Steps/remove_signature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import re
from typing import Any


def remove_newline(text: str) -> str:
    """Collapse every run of whitespace into one space and trim both ends.

    Newlines, carriage returns, tabs and repeated spaces are all treated alike:
    each maximal whitespace run becomes a single blank.

    :param text: The text to normalise.
    :return: The text with single-space separators and no leading/trailing whitespace.
    """
    # A single r"\s+" pass is sufficient: it already covers the "two or more
    # whitespace characters" and "three or more line breaks" cases that were
    # previously handled by a separate, redundant substitution.
    return re.sub(r"\s+", " ", text).strip()


# Closing formulas (English and German, with both "ß" and "ss" spellings) that are
# treated as the start of a signature. The PascalCase name is kept unchanged because
# other modules may import it.
GreetingExpressions = ["sincerely", "best regards", "happy holidays", "kind regards", "warm regards", "cheers",
                       "regards", "mit freundlichen grüßen", "freundliche grüße", "beste grüße", "viele grüße",
                       "herzliche grüße", "liebe grüße", "mit freundlichen grüssen", "freundliche grüsse",
                       "beste grüsse", "viele grüsse", "herzliche grüsse", "liebe grüsse"]
# One of the closing formulas, optionally followed by a comma and whitespace.
greetings_regex = r"(" + "|".join(GreetingExpressions) + r")\s*,?\s*"

# Compiled once at import time instead of on every call.
# The leading \b prevents matches inside longer words (e.g. "regards" in "disregards"),
# which would otherwise cut off ordinary text. DOTALL lets ".*" run across line breaks
# so that the complete remainder of the message is consumed.
_greeting_and_rest_regex = re.compile(
    r"\b" + greetings_regex + r".*",
    flags=re.IGNORECASE | re.UNICODE | re.DOTALL,
)


def remove_greetings_and_following_text(text: str) -> str:
    """Cut the text at the first closing formula and drop everything from there on.

    Matching is case-insensitive and only starts at a word boundary.

    :param text: The message text.
    :return: The text before the first closing formula, stripped of surrounding
        whitespace; the stripped input if no closing formula is found.
    """
    return _greeting_and_rest_regex.sub("", text).strip()


# Thank-you expressions are meant to be removed after the closing formula and the
# signature text following it, as they often appear at the beginning of a message.
THANK_EXPRESSIONS = [
    r"thank you(?: very much)?",  # thank you, thank you very much
    r"thankyou(?: very much)?",  # thankyou, thankyou very much
    r"thanks(?: a lot| again)?",  # thanks, thanks a lot, thanks again
    r"many thanks",  # many thanks
    r"a thousand thanks",  # a thousand thanks
    r"danke(?: schön)?",  # danke, danke schön
    r"vielen dank",  # vielen dank
    r"dankeschön",  # dankeschön
    r"besten dank"  # besten dank
]

# Suffixes which could follow thank-you expressions
THANK_SUFFIXES = [
    r"(?:in advance(?: for (?:your|the) (?:help|support|understanding|assistance))?)",
    r"(?:for (?:your|the) (?:help|support|understanding|assistance))",
    r"(?:schon mal\s+)?(?:im voraus\s+)?für\s+(?:ihre|ihr|eure|die|den)\s+(?:hilfe|support|verständnis)",
    r"vorab",
    r"kindly?"
]

# Combine them into a final regex pattern and compile.
thank_expressions = r"|".join(THANK_EXPRESSIONS)
# The trailing \b keeps a suffix from ending in the middle of a word
# (e.g. "for your help" inside "for your helpful comments").
suffixes = r"(?:\s+(?:" + r"|".join(THANK_SUFFIXES) + r")\b)?"
# The \b after the expression group is essential: without it the shorter alternative
# "danke" wins over "dankeschön" (leaving "schön" behind) and prefixes of ordinary
# words such as "danken", "thanksgiving" or "thank your" are cut out.
final_pattern = (
    r"\b(?:" + thank_expressions + r")\b" + suffixes + r"\s*(?:,|\.|!|;)?\s*"
)
thanking_regex = re.compile(final_pattern, flags=re.IGNORECASE | re.UNICODE)


def remove_thanking_expressions(text: str) -> str:
    """Delete thank-you phrases from the text.

    A phrase consists of a thank-you expression, an optional suffix (such as
    "for your help" or "im voraus für Ihre Hilfe"), at most one of the punctuation
    marks , . ! ; and the whitespace around it. Matching is case-insensitive and
    restricted to whole words.

    :param text: The message text.
    :return: The text with all thank-you phrases removed.
    """
    return thanking_regex.sub("", text)


# Final clean-up vocabulary: stand-alone greeting words that the earlier,
# more specific expressions may have left behind.
single_greeting_words = [
    "liebe grüße",
    "liebe grüsse",
    "grüße",
    "grüsse",
    "gruß",
    "gruss",
]
# Whole-word alternation over the words above.
single_greetings_pattern = r"\b(?:" + "|".join(single_greeting_words) + r")\b"


def remove_single_greeting_words(text: str, pattern: str) -> str:
    """Replace every case-insensitive match of ``pattern`` in ``text`` with one space.

    :param text: The message text.
    :param pattern: Regular expression describing the greeting words to blank out.
    :return: The text with each match substituted by a single space.
    """
    matcher = re.compile(pattern, flags=re.IGNORECASE | re.UNICODE)
    return matcher.sub(" ", text)


def step(item: Any, item_state: dict[str, Any], global_state: dict[str, Any] | None, preprocessor_data: str) -> Any:
if not item:
return item
try:
text_greetings_removed = remove_greetings_and_following_text(item)
thankyou_removed = remove_thanking_expressions(text_greetings_removed)
single_greetings_removed = remove_single_greeting_words(thankyou_removed, single_greetings_pattern)

return remove_newline(single_greetings_removed)
except Exception as e:
raise ValueError(f"An error occurred while removing signature: {e}") from e
7 changes: 4 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "ai-data-preprocessing-queue"
version = "1.6.0"
version = "1.7.0"
description = "A collection of different text processing steps that can be enabled or disabled dynamically."
authors = ["KI-Team"]
license = "MIT"
Expand All @@ -10,15 +10,16 @@ readme = "README.md"
python = "^3.12"
langdetect = "*"
nltk = "*"
pandas = "*"
numpy = "*"
pandas = "*"

[tool.poetry.group.dev.dependencies]
build = "*"
coverage-lcov = "*"
flake8-bandit = "*"
flake8-pydocstyle = "*"
mypy = "*"
build = "*"
parameterized = "*"
pytest = "*"
pytest-cov = "*"
types-mock = "*"
Expand Down
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ build
flake8-bandit
flake8-pydocstyle
mypy
parameterized
pytest
pytest-cov

2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

setuptools.setup(
name="ai-data-preprocessing-queue",
version="1.6.0",
version="1.7.0",
description="Can be used to pre process data before ai processing",
long_description=LONG_DESCRIPTION,
long_description_content_type="text/markdown",
Expand Down
153 changes: 153 additions & 0 deletions tests/test_remove_signature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
import unittest

from parameterized import parameterized
from unittest.mock import MagicMock, patch
from ai_data_preprocessing_queue.Pipeline import Pipeline
from ai_data_preprocessing_queue.Steps.remove_signature import (
step, remove_greetings_and_following_text, remove_newline)


class TestRemoveSignature(unittest.TestCase):
    """Tests for the ``remove_signature`` step and its helper functions."""

    @parameterized.expand([  # type: ignore[misc]
        (
            "multiple_newlines",
            "Could you please review the attached document?\n\n\nI need your feedback by Friday.",
            "Could you please review the attached document? I need your feedback by Friday.",
        ),
        (
            # NOTE(review): input and expected value are identical here - confirm that the
            # input is really meant to contain repeated spaces.
            "multiple_spaces",
            "The meeting is scheduled for 3PM tomorrow.",
            "The meeting is scheduled for 3PM tomorrow.",
        ),
        (
            "mixed_whitespace",
            "Please find the report attached. \n\n The numbers look good \r\n\r\n for Q3!",
            "Please find the report attached. The numbers look good for Q3!",
        ),
        (
            "empty_string",
            "",
            ""
        ),
        (
            "trailing_whitespace",
            "I'll send the updated version tomorrow. \n\n ",
            "I'll send the updated version tomorrow."
        )
    ])
    def test_remove_newline(self, name: str, input_text: str, expected: str) -> None:
        """Whitespace runs are collapsed to one space and the ends are trimmed."""
        self.assertEqual(remove_newline(input_text), expected)

    @parameterized.expand([  # type: ignore[misc]
        (
            "english_signature_basic",
            "Here's the project update. Sincerely, John Smith\nProject Manager",
            "Here's the project update."
        ),
        (
            "english_signature_with_content",
            "Please review the attached documents. Best regards, Jane Doe\nSenior Developer\nTech Department",
            "Please review the attached documents."
        ),
        (
            "english_signature_with_content_and_several_newlines",
            "Please review the attached documents. Best regards,\nJane Doe\n\nSenior Developer\n\nTech Department",
            "Please review the attached documents."
        ),
        (
            "german_signature",
            "Die Unterlagen wurden aktualisiert. Mit freundlichen Grüßen, Hans Schmidt\nPhone: +49 123 456789",
            "Die Unterlagen wurden aktualisiert."
        ),
        (
            "greeting_with_comma",
            "Meeting is scheduled for tomorrow. Kind regards, Sarah",
            "Meeting is scheduled for tomorrow."
        ),
        (
            "mixed_case_greeting",
            "Report is ready. BEST REGARDS, Tom Wilson",
            "Report is ready."
        ),
        (
            "multiple_greetings",
            "Hello team, here's the update. Best regards, Jim\nRegards, HR Team",
            "Hello team, here's the update."
        ),
        (
            "empty_string",
            "",
            ""
        ),
        (
            "no_greetings",
            "This is a plain text without any greetings or signatures.",
            "This is a plain text without any greetings or signatures."
        ),
    ])
    def test_remove_greetings_and_following_text(self, name: str, input_text: str, expected: str) -> None:
        """Everything from the first closing formula onwards is removed."""
        self.assertEqual(remove_greetings_and_following_text(input_text), expected)

    @parameterized.expand([  # type: ignore[misc]
        (
            "remove_signature_basic",
            "We're sending the final draft for review. Best regards, Alice Johnson\nProject Lead",
            "We're sending the final draft for review.",
        ),
        (
            "remove_signature_extended",
            "Order Mice/keyboard\nGoodmorning, Can you please order the following: 10 x Dell Laser Mouse IL3220 "
            "10 x Dell Business Keyboard AB322 (UK layout) Thx Best regards Jimmy B. "
            "| C Facilities & Reception Klaus+Andreas Nederland | Anonymstraat 47 | 1234 AJ Amsterdam | Netherlands "
            "Phone: +01 23 695 4567 | Mobile: +97 65 445 1234 | Fax: +31 35 695 8825 jim.anonymus@company.com "
            "| www.nl.somecompany.com",
            "Order Mice/keyboard Goodmorning, Can you please order the following: 10 x Dell Laser Mouse IL3220 "
            "10 x Dell Business Keyboard AB322 (UK layout) Thx",
        ),
        (
            "thanking_at_start",
            "Thank you very much for your support. "
            "I will prepare the contract and send it tomorrow.\n\nBest regards, Bob Brown",
            "I will prepare the contract and send it tomorrow.",
        ),
        (
            "thanking_in_middle",
            "Thank you very much for your support. "
            "I appreciate your support on this migration. Thanks a lot, I will share the logs shortly.",
            "I appreciate your support on this migration. I will share the logs shortly.",
        ),
        (
            "single_greeting_word_german",
            "The deliverables are ready. Grüße",
            "The deliverables are ready.",
        ),
        (
            "german_empty_result",
            "Vielen Dank für Ihre Hilfe. Mit freundlichen Grüßen, Lena Meyer "
            "Und hier kommt noch mehr Text.",
            "",
        ),
        (
            "no_change",
            "Please schedule the kickoff meeting for next Tuesday morning at 10:00.",
            "Please schedule the kickoff meeting for next Tuesday morning at 10:00.",
        ),
    ])
    def test_remove_signature(self, name: str, input_text: str, expected: str) -> None:
        """The step produces the expected text when run through a ``Pipeline``."""
        pipeline = Pipeline({"remove_signature": None})
        value = pipeline.consume(input_text)
        self.assertEqual(expected, value)

    def test_remove_signature_step_empty_item(self) -> None:
        """An empty item is returned unchanged."""
        result = step("", {}, None, "")
        self.assertEqual(result, "")

    @patch("ai_data_preprocessing_queue.Steps.remove_signature.remove_greetings_and_following_text",
           side_effect=Exception("Test error"))
    def test_remove_signature_step_error(self, _: MagicMock) -> None:
        """A failure inside a processing stage surfaces as ``ValueError``."""
        # Assert the specific exception type that step() raises; a bare
        # ``Exception`` would also accept the unwrapped error from the mock.
        with self.assertRaises(ValueError):
            step("Please schedule the kickoff meeting for next Tuesday morning at 10:00.", {}, None, "")


# Run the tests of this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()