-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdebug_scanner.py
More file actions
222 lines (179 loc) · 8.57 KB
/
debug_scanner.py
File metadata and controls
222 lines (179 loc) · 8.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
import re
from pathlib import Path
import sys
# Mocking the Scanner class to test the logic
class ScannerDebug:
    """Standalone mock of the scanner's external-subtitle matching logic.

    Reproduces the name normalisation, episode-number extraction, match
    scoring and subtitle discovery steps, printing intermediate values so
    that a matching decision can be traced by hand.
    """

    # Lowest score at which a subtitle is reported for a video.
    MIN_MATCH_SCORE = 30

    # Score floor applied when the subtitle stem starts with the video stem
    # (e.g. "video.ru.srt" next to "video.mp4").
    STEM_PREFIX_SCORE = 80

    # Tried in order; the first pattern that matches supplies the number.
    # NOTE: the trailing "s<digits>e<digits>" entry can never win, because
    # the bare "e" alternative of the first pattern already matches the same
    # text. It is kept so the list stays in step with the logic it mirrors.
    _EPISODE_PATTERNS = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'(?:episode|ep|e|урок|lesson|part|часть|глава|chapter|ch)[\s\.\-_]*(\d+)',
            r'^(\d+)[\s\.\-_]',
            r'[\s\.\-_](\d+)[\s\.\-_]',
            r'[\s\.\-_](\d+)$',
            r's\d+e(\d+)',
        )
    )

    # (compiled pattern, language code) pairs, checked in this order.
    # A tag only counts when delimited by start/end of name or one of
    # "[ ( . _ -" before it and "] ) . _ -" after it.
    _LANGUAGE_PATTERNS = (
        (re.compile(r'(?:^|[\[\(._\-])(rus|russian|ru|рус)(?:$|[\]\)._\-])', re.IGNORECASE), 'ru'),
        (re.compile(r'(?:^|[\[\(._\-])(eng|english|en|англ)(?:$|[\]\)._\-])', re.IGNORECASE), 'en'),
        (re.compile(r'(?:^|[\[\(._\-])(ukr|ukrainian|ua|укр)(?:$|[\]\)._\-])', re.IGNORECASE), 'uk'),
        (re.compile(r'(?:^|[\[\(._\-])(jpn|japanese|ja|jp|яп)(?:$|[\]\)._\-])', re.IGNORECASE), 'ja'),
        (re.compile(r'(?:^|[\[\(._\-])(ger|german|de|deu|нем)(?:$|[\]\)._\-])', re.IGNORECASE), 'de'),
        (re.compile(r'(?:^|[\[\(._\-])(fra|french|fr|фр)(?:$|[\]\)._\-])', re.IGNORECASE), 'fr'),
        (re.compile(r'(?:^|[\[\(._\-])(spa|spanish|es|исп)(?:$|[\]\)._\-])', re.IGNORECASE), 'es'),
        (re.compile(r'(?:^|[\[\(._\-])(chi|chinese|zh|кит)(?:$|[\]\)._\-])', re.IGNORECASE), 'zh'),
    )

    _FORCED_PATTERN = re.compile(r'(?:^|[\[\(._\-])forced(?:$|[\]\)._\-])', re.IGNORECASE)

    # Codec name reported for each known extension; extensions missing here
    # (".idx", ".txt") fall back to the extension text without the dot.
    _CODEC_BY_EXTENSION = {
        '.srt': 'subrip',
        '.ass': 'ass',
        '.ssa': 'ass',
        '.sub': 'subviewer',
        '.vtt': 'webvtt',
        '.sup': 'hdmv_pgs_subtitle',
        '.stl': 'stl',
        '.smi': 'sami',
    }

    def __init__(self):
        self.subtitle_extensions = {'.srt', '.ass', '.ssa', '.sub', '.idx', '.vtt', '.sup', '.stl', '.smi', '.txt'}

    def _normalize_name(self, name):
        """Normalize a filename for comparison.

        Drops the final extension, lower-cases the rest, turns
        ``_ - . [ ] ( ) { }`` into spaces and collapses runs of whitespace.
        """
        name = Path(name).stem
        name = name.lower()
        name = re.sub(r'[_\-\.\[\]\(\)\{\}]', ' ', name)
        name = re.sub(r'\s+', ' ', name).strip()
        return name

    def _extract_episode_number(self, name):
        """Extract the episode/lesson number from a filename.

        Returns the number as an ``int`` taken from the first pattern in
        ``_EPISODE_PATTERNS`` that matches the extension-less name, or
        ``None`` when no pattern matches.
        """
        name = Path(name).stem
        for pattern in self._EPISODE_PATTERNS:
            match = pattern.search(name)
            if match:
                return int(match.group(1))
        return None

    def _calculate_match_score(self, video_name, audio_name):
        """Score how well an external file name matches a video file name.

        Scoring, on the normalised names:
          * identical names: +100;
          * otherwise +50 if one name contains the other, plus up to +30
            proportional to the shared prefix length relative to the
            shorter name;
          * +40 when both names yield the same episode number.

        Returns the total as an ``int``.
        """
        video_norm = self._normalize_name(video_name)
        audio_norm = self._normalize_name(audio_name)
        print(f"Comparing '{video_norm}' with '{audio_norm}'")

        score = 0
        if video_norm == audio_norm:
            # Exact name match
            score += 100
        else:
            # One name contains the other. The empty string is a substring
            # of every string, so a name that normalises to "" (for example
            # "_.srt") must not earn this bonus against every video.
            if video_norm and audio_norm and (
                video_norm in audio_norm or audio_norm in video_norm
            ):
                score += 50

            # Common prefix, relative to the shorter of the two names
            shorter = min(len(video_norm), len(audio_norm))
            if shorter > 0:
                common_prefix = 0
                for video_char, audio_char in zip(video_norm, audio_norm):
                    if video_char != audio_char:
                        break
                    common_prefix += 1
                score += int(common_prefix / shorter * 30)

        # Episode number match
        video_ep = self._extract_episode_number(video_name)
        audio_ep = self._extract_episode_number(audio_name)
        if video_ep is not None and video_ep == audio_ep:
            score += 40

        # NOTE(review): per the original author's note, the scanner method
        # this mirrors also checks audio language tags; that part is not
        # reproduced here - confirm against scanner.py if it matters.
        return score

    def _detect_language(self, sub_name):
        """Return the language code of the first tag found in *sub_name*, else ``None``."""
        for regex, lang in self._LANGUAGE_PATTERNS:
            if regex.search(sub_name):
                print(f"Found language: {lang} with pattern {regex.pattern}")
                return lang
        return None

    def _find_external_subtitles(self, video_file, folder):
        """Find external subtitle files matching a video file, with debug prints.

        Args:
            video_file: object exposing ``name`` and ``stem`` for the video.
            folder: iterable of path-like objects exposing ``is_file()``,
                ``name``, ``stem`` and ``suffix`` (stands in for the
                directory listing).

        Returns:
            A list of dicts, one per subtitle whose score reaches
            ``MIN_MATCH_SCORE``, sorted by ``match_score`` descending.
            On an unexpected error the exception and traceback are printed
            and whatever was collected so far is returned.
        """
        external_subtitles = []
        print(f"\nScanning for subtitles for: {video_file.name}")
        try:
            subtitle_files = [
                f for f in folder
                if f.is_file() and f.suffix.lower() in self.subtitle_extensions
            ]
            print(f"Found subtitle files in folder: {[f.name for f in subtitle_files]}")
            if not subtitle_files:
                return []

            video_name = video_file.name
            video_stem = video_file.stem.lower()

            for sub_file in subtitle_files:
                sub_name = sub_file.name
                sub_stem = sub_file.stem.lower()
                print(f"Checking subtitle: {sub_name}")

                match_score = self._calculate_match_score(video_name, sub_name)
                print(f"Initial match score: {match_score}")

                # Also check exact name match (video.ru.srt -> video.mp4)
                if sub_stem.startswith(video_stem):
                    match_score = max(match_score, self.STEM_PREFIX_SCORE)
                    print(f"Boosted score (startswith): {match_score}")

                # Minimum match threshold
                if match_score < self.MIN_MATCH_SCORE:
                    continue

                language = self._detect_language(sub_name)
                is_forced = 1 if self._FORCED_PATTERN.search(sub_name) else 0
                codec = self._CODEC_BY_EXTENSION.get(
                    sub_file.suffix.lower(), sub_file.suffix[1:]
                )

                external_subtitles.append({
                    'track_type': 'external',
                    'stream_index': None,
                    'subtitle_file_path': str(sub_file),
                    'subtitle_file_name': sub_name,
                    'language': language,
                    'title': sub_stem,
                    'codec': codec,
                    'format': sub_file.suffix[1:].upper(),
                    'is_default': 0,
                    'is_forced': is_forced,
                    'match_score': match_score,
                })

            # Sort by relevance
            external_subtitles.sort(key=lambda x: x['match_score'], reverse=True)
        except Exception as e:
            # Debug harness: report the failure loudly but still return what
            # was gathered, so the rest of the run can be inspected.
            print(f"EXCEPTION: {e}")
            import traceback
            traceback.print_exc()
        return external_subtitles
# Test execution
def run_test():
    """Run the subtitle matcher against a fixed, in-memory set of file names.

    Prints the matcher's debug trace followed by one summary line per
    subtitle it reports for the sample video.
    """

    class MockPath:
        """Minimal stand-in for ``pathlib.Path`` whose ``is_file()`` is always true.

        The real ``Path.is_file()`` checks the file system, and the sample
        files below do not exist on disk, so genuine ``Path`` objects would
        all be filtered out by the matcher.
        """

        def __init__(self, path):
            parsed = Path(path)
            self.path = path
            self.name = parsed.name
            self.stem = parsed.stem
            self.suffix = parsed.suffix

        def is_file(self):
            return True

        def __str__(self):
            return str(self.path)

    scanner = ScannerDebug()

    # Dummy files: a bare match, language-tagged variants, and an unrelated file.
    folder_files = [
        Path("C:/Movies/My Video.srt"),
        Path("C:/Movies/My Video.en.srt"),
        Path("C:/Movies/My Video.ru.srt"),
        Path("C:/Movies/Other.srt"),
        Path("C:/Movies/My Video (eng).srt"),
    ]

    video_mock = MockPath("C:/Movies/My Video.mp4")
    folder_mocks = [MockPath(p) for p in folder_files]

    results = scanner._find_external_subtitles(video_mock, folder_mocks)

    print("\nResults:")
    for res in results:
        print(f"Found: {res['subtitle_file_name']} (Lang: {res['language']}, Score: {res['match_score']})")


if __name__ == "__main__":
    run_test()