-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_test_data.py
More file actions
151 lines (132 loc) · 5.98 KB
/
generate_test_data.py
File metadata and controls
151 lines (132 loc) · 5.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import argparse
from pathlib import Path
# Parse args and optionally reset the DB *before* importing translator.
# Importing translator starts a background thread that opens a DB connection,
# so the reset must happen first to avoid a concurrent CREATE TABLE conflict.
parser = argparse.ArgumentParser(description="Generate test translation data.")
parser.add_argument(
    "--reset",
    action="store_true",
    help="Truncate translation_usage before inserting (avoids duplicate rows on re-runs).",
)
args = parser.parse_args()

if args.reset:
    # Imported only when needed; runs without --reset never import db here.
    from db import get_connection, init_db

    conn = get_connection()
    try:
        # init_db runs before the DELETE — presumably it creates
        # translation_usage if it is missing; confirm against db.init_db.
        init_db(conn)
        conn.execute("DELETE FROM translation_usage")
        # NOTE(review): no explicit commit is issued. If get_connection()
        # returns a driver that opens implicit transactions (e.g. sqlite3 in
        # its default mode), the DELETE would be rolled back on close —
        # confirm the connection autocommits.
    finally:
        # Always release the connection, even if init_db or the DELETE fails,
        # so the translator's own connection is not competing with a leaked one.
        conn.close()
    print("Database reset.\n")
import deepl
import translator
from translator import translate, translate_document, query_by_language_pair
# ── Ensure test_docs directories exist ────────────────────────────────────────
def _ensure_fixture(path: Path, content: str) -> Path:
    """Create *path* with *content* (UTF-8) unless it already exists.

    Existing files are left untouched. Returns *path* so the call can be used
    directly in an assignment.
    """
    if not path.exists():
        # Explicit encoding keeps the fixture bytes independent of the locale.
        path.write_text(content, encoding="utf-8")
    return path


SCRIPT_DIR = Path(__file__).parent
DOCS_DIR = SCRIPT_DIR / "test_docs"
OUTPUT_DIR = DOCS_DIR / "output"
# parents=True creates DOCS_DIR as well, so one call covers both directories.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

demo_txt = _ensure_fixture(
    DOCS_DIR / "demo.txt",
    "This is a DeepL demo document.",
)
demo_html = _ensure_fixture(
    DOCS_DIR / "demo.html",
    "<!DOCTYPE html><html><body><p>This is a DeepL demo document.</p></body></html>",
)
demo_srt = _ensure_fixture(
    DOCS_DIR / "demo.srt",
    "1\n00:00:01,000 --> 00:00:04,000\nThis is a DeepL demo document.\n",
)
# Minimal SVG used to trigger the "unsupported document type" error case.
demo_svg = _ensure_fixture(
    DOCS_DIR / "demo.svg",
    '<svg xmlns="http://www.w3.org/2000/svg"><text y="20">DeepL</text></svg>',
)
# ── Text translation test cases ───────────────────────────────────────────────
# Each entry is (texts, target_lang, reporting_tag). The Japanese entry passes
# None as its reporting tag.
test_cases = [
    (["Guten Morgen!", "Wie geht es Ihnen?"], "EN-US", "team-id-1"),
    (["Bonjour tout le monde.", "Merci beaucoup."], "EN-US", "team-id-1"),
    (["Hello, how are you?", "Good morning!"], "DE", "team-id-2"),
    (["The weather is nice today."], "FR", "team-id-2"),
    (["Gracias por su ayuda."], "EN-US", "team-id-3"),
    (["Ich liebe Berlin.", "Das ist sehr gut."], "ES", "team-id-3"),
    (["今日はいい天気ですね。"], "EN-US", None),
    (["Please translate this document carefully."], "DE", "team-id-1"),
]

print("Sending text translations...\n")
for case in test_cases:
    texts, target_lang, tag = case
    # Untagged requests are shown with a placeholder label in the console output.
    tag_label = tag if tag else "(no tag)"
    results = translate(texts, target_lang=target_lang, reporting_tag=tag)
    # One output line per source text, paired with its translation result.
    for text, result in zip(texts, results):
        line = f" [{tag_label}] {text!r} → {result.text!r} ({result.billed_characters} chars, {result.detected_source_lang}→{target_lang})"
        print(line)
# ── Document translation test cases ───────────────────────────────────────────
# Each entry is (input_path, target_lang, reporting_tag). Every demo document
# (txt, html, srt) is translated into DE, FR and JA.
doc_cases = [
    (demo_txt, "DE", "team-id-1"),
    (demo_txt, "FR", "team-id-2"),
    (demo_txt, "JA", "team-id-1"),
    (demo_html, "DE", "team-id-2"),
    (demo_html, "FR", "team-id-3"),
    (demo_html, "JA", "team-id-3"),
    (demo_srt, "DE", "team-id-1"),
    (demo_srt, "FR", "team-id-2"),
    (demo_srt, "JA", "team-id-3"),
]

print("\nSending document translations...\n")
for doc_case in doc_cases:
    input_path, target_lang, tag = doc_case
    stem, ext = input_path.stem, input_path.suffix
    # The output file name carries the target language, e.g. demo_DE.txt.
    output_path = OUTPUT_DIR / f"{stem}_{target_lang}{ext}"
    request_options = {"target_lang": target_lang, "reporting_tag": tag}
    translate_document(str(input_path), str(output_path), **request_options)
    tag_label = tag if tag else "(no tag)"
    print(f" [{tag_label}] {input_path.name} → {output_path.name} (target: {target_lang})")
# ── Real error cases ───────────────────────────────────────────────────────────
def _report_expected_error(label, action):
    """Call *action* and print the exception it raises, tagged with *label*.

    The catch is deliberately broad: this section exists to provoke failures
    and show them, whatever their type. If *action* completes without raising,
    that is reported too, so a missing error is not silently skipped.
    """
    try:
        action()
    except Exception as e:
        # http_status_code is read defensively; not every exception carries it.
        print(f" [{label}] {type(e).__name__} (HTTP {getattr(e, 'http_status_code', '?')}): {e}")
    else:
        print(f" [{label}] no exception raised (expected a failure)")


print("\nGenerating real errors...\n")

# 1. Bad API key → expect 403 AuthorizationException
# Temporarily swap the translator module's client for one built with an
# invalid key; the finally clause restores the real client for later cases.
real_client = translator._client
translator._client = deepl.DeepLClient("my-api-key")
try:
    _report_expected_error(
        "error 1 - bad key",
        lambda: translate(["Hello world"], target_lang="DE", reporting_tag="team-id-1"),
    )
finally:
    translator._client = real_client

# 2. Unsupported target language (Klingon) → expect 400
_report_expected_error(
    "error 2 - klingon",
    lambda: translate(["Hello world"], target_lang="tlh", reporting_tag="team-id-2"),
)

# 3. Document translation with same source and target language (EN → EN-US)
_report_expected_error(
    "error 3 - same lang",
    lambda: translate_document(
        str(demo_txt),
        str(OUTPUT_DIR / "error_same_lang.txt"),
        target_lang="EN-US",
        source_lang="EN",
        reporting_tag="team-id-3",
    ),
)

# 4. Unsupported document type (SVG)
_report_expected_error(
    "error 4 - svg",
    lambda: translate_document(
        str(demo_svg),
        str(OUTPUT_DIR / "demo_DE.svg"),
        target_lang="DE",
        reporting_tag="team-id-1",
    ),
)
# Block until the log worker has processed every queued entry, so the summary
# below is not printed while entries are still queued.
translator._log_queue.join()

# ── Usage summary ─────────────────────────────────────────────────────────────
print("\n--- Usage by language pair ---\n")
usage_by_pair = query_by_language_pair()
summary_text = usage_by_pair.to_string(index=False)
print(summary_text)