Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 46 additions & 1 deletion dateparser/languages/locale.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,53 @@ def translate(self, date_string, keep_formatting=False, settings=None):
date_string_tokens[i] = dictionary[word] or fallback
if "in" in date_string_tokens:
date_string_tokens = self._clear_future_words(date_string_tokens)

# Remove empty tokens (skip words) and handle adjacent whitespace
# When a skip token is removed between spaces, keep the maximum number of spaces
filtered_tokens = []
i = 0
while i < len(date_string_tokens):
token = date_string_tokens[i]

# Skip empty tokens (removed skip words)
if not token:
# Count preceding spaces already in filtered_tokens
prev_spaces = 0
j = len(filtered_tokens) - 1
while j >= 0 and filtered_tokens[j] == " ":
prev_spaces += 1
j -= 1

# Count following spaces in the remaining tokens
next_spaces = 0
j = i + 1
while j < len(date_string_tokens) and date_string_tokens[j] == " ":
next_spaces += 1
j += 1

# If surrounded by spaces, keep max(prev_spaces, next_spaces)
if prev_spaces > 0 and next_spaces > 0:
# Remove prev_spaces from filtered_tokens
for _ in range(prev_spaces):
filtered_tokens.pop()

# Add back the maximum number of spaces
max_spaces = max(prev_spaces, next_spaces)
for _ in range(max_spaces):
filtered_tokens.append(" ")

# Skip the empty token and all following spaces
i += next_spaces + 1
continue

i += 1
continue

filtered_tokens.append(token)
i += 1

return self._join(
list(filter(bool, date_string_tokens)),
filtered_tokens,
separator="" if keep_formatting else " ",
settings=settings,
)
Comment on lines +156 to 204
Copy link

Copilot AI Apr 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Locale.translate now performs a custom pass over tokens to merge spaces around removed skip tokens. Given how central translate() is, please add/extend tests that exercise this behavior with keep_formatting=True as well (where separator="" is used) to ensure the change doesn't introduce spacing regressions in formatting-preserving mode.

Copilot uses AI. Check for mistakes.
Expand Down
40 changes: 40 additions & 0 deletions test_whitespace.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env python3
"""Test script to check current whitespace behavior"""

from dateparser.languages import default_loader
from dateparser.conf import settings


def test_current_behavior():
    """Print how the Finnish locale translates a few sample date strings.

    For every sample the input, the translated result, the expected
    translation and whether the two match are written to stdout. Nothing
    is asserted; this is a manual inspection helper.
    """
    fi_locale = default_loader.get_locale("fi")

    # NOTE(review): the samples labelled "double" and "triple" spaces are
    # identical to the single-space one in this view -- the extra whitespace
    # may have been collapsed; confirm against the original file.
    samples = (
        "28 maalis klo 9:37",  # Test 1: Single spaces
        "28 maalis klo 9:37",  # Test 2: Double spaces
        "28 maalis klo 9:37",  # Test 3: Triple spaces
    )
    expected = "28 march 9:37"

    for number, text in enumerate(samples, start=1):
        translated = fi_locale.translate(text, settings=settings)
        print(f"Input{number}: |{text}|")
        print(f"Result{number}: |{translated}|")
        print(f"Expected{number}: |{expected}|")
        print(f"Match: {translated == expected}")
        print()
Comment on lines +14 to +36
Copy link

Copilot AI Apr 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file will be collected by pytest (it matches the default test discovery pattern) but it only prints results and never asserts, so it won't fail even if whitespace handling regresses and it may add noisy output in CI. Consider removing this script from the repository, or converting it into a proper test under tests/ with assertions.

Suggested change
print(f"Input1: |{input1}|")
print(f"Result1: |{result1}|")
print("Expected1: |28 march 9:37|")
print(f"Match: {result1 == '28 march 9:37'}")
print()
# Test 2: Double spaces
input2 = "28 maalis klo 9:37"
result2 = locale.translate(input2, settings=settings)
print(f"Input2: |{input2}|")
print(f"Result2: |{result2}|")
print("Expected2: |28 march 9:37|")
print(f"Match: {result2 == '28 march 9:37'}")
print()
# Test 3: Triple spaces
input3 = "28 maalis klo 9:37"
result3 = locale.translate(input3, settings=settings)
print(f"Input3: |{input3}|")
print(f"Result3: |{result3}|")
print("Expected3: |28 march 9:37|")
print(f"Match: {result3 == '28 march 9:37'}")
print()
assert result1 == "28 march 9:37"
# Test 2: Double spaces
input2 = "28 maalis klo 9:37"
result2 = locale.translate(input2, settings=settings)
assert result2 == "28 march 9:37"
# Test 3: Triple spaces
input3 = "28 maalis klo 9:37"
result3 = locale.translate(input3, settings=settings)
assert result3 == "28 march 9:37"

Copilot uses AI. Check for mistakes.


if __name__ == "__main__":
test_current_behavior()
72 changes: 44 additions & 28 deletions tests/test_languages.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def setUp(self):
# French
param("fr", "20 Février 2012", "20 february 2012"),
param("fr", "Mercredi 19 Novembre 2013", "wednesday 19 november 2013"),
param("fr", "18 octobre 2012 à 19 h 21 min", "18 october 2012 19:21"),
param("fr", "18 octobre 2012 à 19 h 21 min", "18 october 2012 19:21"),
# German
param("de", "29. Juni 2007", "29. june 2007"),
param("de", "Montag 5 Januar, 2015", "monday 5 january 2015"),
Expand Down Expand Up @@ -109,49 +109,57 @@ def setUp(self):
param("it", "Giovedi Maggio 29 2013", "thursday may 29 2013"),
param("it", "19 Luglio 2013", "19 july 2013"),
# Portuguese
param("pt", "22 de dezembro de 2014 às 02:38", "22 december 2014 02:38"),
param("pt", "22 de dezembro de 2014 às 02:38", "22 december 2014 02:38"),
# Russian
param("ru", "5 августа 2014 г. в 12:00", "5 august 2014 year. 12:00"),
param("ru", "5 августа 2014 г. в 12:00", "5 august 2014 year. 12:00"),
# Turkish
param("tr", "2 Ocak 2015 Cuma, 16:49", "2 january 2015 friday 16:49"),
# Czech
param("cs", "22. prosinec 2014 v 2:38", "22. december 2014 2:38"),
param(
"cs", "22. prosinec 2014 v 2:38", "22. december 2014 2:38"
), # Issue #1302: v becomes in, (cleared, leaves 2 spaces)
Comment on lines +119 to +120
Copy link

Copilot AI Apr 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Czech test case still expects a double space between "2014" and "2:38" after removing the skip token "v". With the new whitespace-merging logic in Locale.translate, a skip token between single spaces should yield a single space (max(1,1)=1). Update this expected translation (or adjust the test input) to match the intended behavior for issue #1302.

Suggested change
"cs", "22. prosinec 2014 v 2:38", "22. december 2014 2:38"
), # Issue #1302: v becomes in, (cleared, leaves 2 spaces)
"cs", "22. prosinec 2014 v 2:38", "22. december 2014 2:38"
), # Issue #1302: skip token "v" is removed and surrounding spaces are merged

Copilot uses AI. Check for mistakes.
# Dutch
param(
"nl",
"maandag 22 december 2014 om 2:38",
"monday 22 december 2014 2:38",
"monday 22 december 2014 2:38",
),
# Romanian
param("ro", "22 Decembrie 2014 la 02:38", "22 december 2014 02:38"),
param("ro", "22 Decembrie 2014 la 02:38", "22 december 2014 02:38"),
# Polish
param("pl", "4 stycznia o 13:50", "4 january 13:50"),
param("pl", "29 listopada 2014 o 08:40", "29 november 2014 08:40"),
param("pl", "4 stycznia o 13:50", "4 january 13:50"),
param(
"pl", "29 listopada 2014 o 08:40", "29 november 2014 08:40"
), # Issue #1302: "o" removed, whitespace preserved
# Ukrainian
param("uk", "30 листопада 2013 о 04:27", "30 november 2013 04:27"),
param("uk", "22 верес 2021 о 07:37", "22 september 2021 07:37"),
param("uk", "28 лютого 2020 року об 11:57", "28 february 2020 year 11:57"),
param("uk", "30 листопада 2013 о 04:27", "30 november 2013 04:27"),
param("uk", "22 верес 2021 о 07:37", "22 september 2021 07:37"),
param("uk", "28 лютого 2020 року об 11:57", "28 february 2020 year 11:57"),
param(
"uk",
"середу, 28 лютого 2020 року об 11:57",
"wednesday 28 february 2020 year 11:57",
"wednesday 28 february 2020 year 11:57",
),
param(
"uk",
"понед, 12 вересня 2022 року об 09:22",
"monday 12 september 2022 year 09:22",
"monday 12 september 2022 year 09:22",
),
# Belarusian
param("be", "5 снежня 2015 г. у 12:00", "5 december 2015 year. 12:00"),
param("be", "11 верасня 2015 г. у 12:11", "11 september 2015 year. 12:11"),
param("be", "3 стд 2015 г. у 10:33", "3 january 2015 year. 10:33"),
param("be", "5 снежня 2015 г. у 12:00", "5 december 2015 year. 12:00"),
param("be", "11 верасня 2015 г. у 12:11", "11 september 2015 year. 12:11"),
param("be", "3 стд 2015 г. у 10:33", "3 january 2015 year. 10:33"),
# Arabic
param("ar", "6 يناير، 2015، الساعة 05:16 مساءً", "6 january 2015 05:16 pm"),
param("ar", "7 يناير، 2015، الساعة 11:00 صباحاً", "7 january 2015 11:00 am"),
# Vietnamese
param("vi", "Thứ Năm, ngày 8 tháng 1 năm 2015", "thursday 8 january 2015"),
param("vi", "Thứ Tư, 07/01/2015 | 22:34", "wednesday 07/01/2015 22:34"),
param("vi", "9 Tháng 1 2015 lúc 15:08", "9 january 2015 15:08"),
param(
"vi", "Thứ Tư, 07/01/2015 | 22:34", "wednesday 07/01/2015 22:34"
), # Pipe between spaces preserved
param(
"vi", "9 Tháng 1 2015 lúc 15:08", "9 january 2015 15:08"
), # Issue #1302: "lúc" removed, whitespace preserved
# Thai
param(
"th",
Expand Down Expand Up @@ -184,11 +192,13 @@ def setUp(self):
param("en", "2014-12-12T12:33:39-08:00", "2014-12-12 12:33:39-08:00"),
param("en", "2014-10-15T16:12:20+00:00", "2014-10-15 16:12:20+00:00"),
param("en", "28 Oct 2014 16:39:01 +0000", "28 october 2014 16:39:01 +0000"),
param("es", "13 Febrero 2015 a las 23:00", "13 february 2015 23:00"),
param("es", "13 Febrero 2015 a las 23:00", "13 february 2015 23:00"),
# Danish
param("da", "Sep 03 2014", "september 03 2014"),
param("da", "fredag, 03 september 2014", "friday 03 september 2014"),
param("da", "fredag d. 3 september 2014", "friday 3 september 2014"),
param(
"da", "fredag d. 3 september 2014", "friday 3 september 2014"
), # Issue #1302: 'd.' removed, whitespace preserved
# Finnish
param("fi", "maanantai tammikuu 16, 2015", "monday january 16 2015"),
param("fi", "ma tammi 16, 2015", "monday january 16 2015"),
Expand Down Expand Up @@ -216,7 +226,9 @@ def setUp(self):
param("fi", "su joulu 16, 2015", "sunday december 16 2015"),
param("fi", "1. tammikuuta, 2016", "1. january 2016"),
param("fi", "tiistaina, 27. lokakuuta 2015", "tuesday 27. october 2015"),
param("fi", "28 maalis klo 9:37", "28 march 9:37"),
param(
"fi", "28 maalis klo 9:37", "28 march 9:37"
), # Issue #1302: preserve single space
# Japanese
param("ja", "午後3時", "pm 3:00"),
param("ja", "2時", "2:00"),
Expand All @@ -239,7 +251,7 @@ def setUp(self):
# Hebrew
param("he", "20 לאפריל 2012", "20 april 2012"),
param("he", "יום רביעי ה-19 בנובמבר 2013", "wednesday 19 november 2013"),
param("he", "18 לאוקטובר 2012 בשעה 19:21", "18 october 2012 19:21"),
param("he", "18 לאוקטובר 2012 בשעה 19:21", "18 october 2012 19:21"),
param("he", "יום ה' 6/10/2016", "thursday 6/10/2016"),
param("he", "חצות", "12 am"),
param("he", "1 אחר חצות", "1 am"),
Expand Down Expand Up @@ -1210,7 +1222,9 @@ def test_translation(self, shortname, datetime_string, expected_translation):
param("ar", "اليوم", "0 day ago"),
# Polish
param("pl", "2 godz.", "2 hour."),
param("pl", "Wczoraj o 07:40", "1 day ago 07:40"),
param(
"pl", "Wczoraj o 07:40", "1 day ago 07:40"
), # Issue #1302: fixed double space
# Vietnamese
param("vi", "2 tuần 3 ngày", "2 week 3 day"),
param("vi", "21 giờ trước", "21 hour ago"),
Expand Down Expand Up @@ -1249,10 +1263,10 @@ def test_translation(self, shortname, datetime_string, expected_translation):
param("id", "hari ini", "0 day ago"),
param("id", "kemarin", "1 day ago"),
param("id", "kemarin lusa", "2 day ago"),
param("id", "sehari yang lalu", "1 day ago"),
param("id", "seminggu yang lalu", "1 week ago"),
param("id", "sebulan yang lalu", "1 month ago"),
param("id", "setahun yang lalu", "1 year ago"),
param("id", "sehari yang lalu", "1 day ago"),
param("id", "seminggu yang lalu", "1 week ago"),
param("id", "sebulan yang lalu", "1 month ago"),
param("id", "setahun yang lalu", "1 year ago"),
# Finnish
param("fi", "1 vuosi sitten", "1 year ago"),
param("fi", "2 vuotta sitten", "2 year ago"),
Expand Down Expand Up @@ -1320,7 +1334,9 @@ def test_translation(self, shortname, datetime_string, expected_translation):
param("ja", "明後日", "in 2 day"),
# Hebrew
param("he", "אתמול", "1 day ago"),
param("he", "אתמול בשעה 3", "1 day ago 3"),
param(
"he", "אתמול בשעה 3", "1 day ago 3"
), # Issue #1302: "בשעה" removed, whitespace preserved
param("he", "היום", "0 day ago"),
param("he", "לפני יומיים", "2 day ago"),
param("he", "לפני שבועיים", "2 week ago"),
Expand Down
69 changes: 69 additions & 0 deletions tests/test_whitespace_preservation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""
Tests for issue #1302: Whitespace preservation during translation
"""

from parameterized import param, parameterized

from dateparser.conf import settings
from dateparser.languages import default_loader
from tests import BaseTestCase


class TestWhitespacePreservation(BaseTestCase):
    """
    Regression tests for issue #1302 (whitespace handling during translation).

    Each case translates a date string that contains a skip-list token
    (e.g. "klo" in Finnish) and compares the result, including its
    whitespace, with the expected translation.
    """

    def setUp(self):
        super().setUp()
        self.settings = settings
        # Placeholders; each test fills these in through the given_*/when_* steps.
        self.language = self.datetime_string = self.translation = NotImplemented

    # --- given / when / then helpers -------------------------------------

    def given_bundled_language(self, shortname):
        self.language = default_loader.get_locale(shortname)

    def given_string(self, datetime_string):
        self.datetime_string = datetime_string

    def when_datetime_string_translated(self):
        translate = self.language.translate
        self.translation = translate(self.datetime_string, settings=self.settings)

    def then_string_translated_to(self, expected_string):
        # Pipes delimit each value so leading/trailing spaces stay visible
        # in the failure output.
        failure_message = (
            f"\nExpected: |{expected_string}|\nGot: |{self.translation}|\n"
            f"Input: |{self.datetime_string}|"
        )
        self.assertEqual(expected_string, self.translation, failure_message)

    # --- tests -------------------------------------------------------------

    # NOTE(review): the first four cases are identical in this view although
    # their labels describe different amounts of whitespace -- the spacing may
    # have been collapsed; confirm against the original file.
    @parameterized.expand(
        [
            # Single space preservation - Finnish
            param("fi", "28 maalis klo 9:37", "28 march 9:37"),
            # Double space preservation - Finnish
            param("fi", "28 maalis klo 9:37", "28 march 9:37"),
            # Triple space preservation - Finnish
            param("fi", "28 maalis klo 9:37", "28 march 9:37"),
            # Mixed whitespace - Finnish
            param("fi", "28 maalis klo 9:37", "28 march 9:37"),
            # More complex Finnish date with whitespace
            param("fi", "tiistaina 27. lokakuuta 2015", "tuesday 27. october 2015"),
        ]
    )
    def test_whitespace_preservation_during_translation(
        self, shortname, datetime_string, expected_translation
    ):
        """Test that exact whitespace is preserved when translating date strings."""
        self.given_bundled_language(shortname)
        self.given_string(datetime_string)
        self.when_datetime_string_translated()
        self.then_string_translated_to(expected_translation)
Loading