Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
EpubConverter,
DocumentIntelligenceConverter,
CsvConverter,
VttConverter,
)

from ._base_converter import DocumentConverter, DocumentConverterResult
Expand Down Expand Up @@ -202,6 +203,7 @@ def enable_builtins(self, **kwargs) -> None:
self.register_converter(OutlookMsgConverter())
self.register_converter(EpubConverter())
self.register_converter(CsvConverter())
self.register_converter(VttConverter())

# Register Document Intelligence converter at the top of the stack if endpoint is provided
docintel_endpoint = kwargs.get("docintel_endpoint")
Expand Down
2 changes: 2 additions & 0 deletions packages/markitdown/src/markitdown/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
)
from ._epub_converter import EpubConverter
from ._csv_converter import CsvConverter
from ._vtt_converter import VttConverter

__all__ = [
"PlainTextConverter",
Expand All @@ -45,4 +46,5 @@
"DocumentIntelligenceFileType",
"EpubConverter",
"CsvConverter",
"VttConverter",
]
174 changes: 174 additions & 0 deletions packages/markitdown/src/markitdown/converters/_vtt_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
import re
from typing import BinaryIO, Any
from charset_normalizer import from_bytes
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import FileConversionException

# MIME type prefixes (compared with str.startswith against the lowercased
# stream MIME type) that identify a stream as WebVTT.
ACCEPTED_MIME_TYPE_PREFIXES = ["text/vtt"]

# File extensions (lowercase, including the leading dot) that identify a
# stream as WebVTT.
ACCEPTED_FILE_EXTENSIONS = [".vtt"]

# Largest input, in bytes, that the converter will process (10 MiB).
MAX_FILE_SIZE = 10 * 1024**2

# Largest number of cues the converter will parse from one file.
MAX_CUES = 10_000


class VttConverter(DocumentConverter):
    """
    Converts WebVTT (.vtt) subtitle files to Markdown.

    Every cue becomes one paragraph of the form ``[HH:MM:SS.mmm] cue text``,
    where the timestamp is the cue's *start* time. Multi-line cues are joined
    with single spaces, voice tags (``<v Name>``) are rendered as a
    ``Name: `` prefix, and every other inline tag is removed.

    Anything outside a cue that is not a timing line (the ``WEBVTT`` header,
    blank lines, NOTE/STYLE/REGION blocks, cue identifiers) is dropped.
    """

    # Signature that starts every WebVTT file, optionally preceded by a BOM.
    _SIGNATURE = b"WEBVTT"
    _UTF8_BOM = b"\xef\xbb\xbf"

    # Cue timing line: "<start> --> <end>", optionally followed by cue
    # settings (the pattern is deliberately not anchored at the end).
    # WebVTT timestamps are "[hh+:]mm:ss.ttt": the hours field is optional
    # and may have more than two digits.
    # Groups: start h, m, s, ms, end h, m, s, ms (hour groups may be None).
    _CUE_TIMING_PATTERN = re.compile(
        r"^(?:(\d{2,}):)?(\d{2}):(\d{2})\.(\d{3})"
        r"\s*-->\s*"
        r"(?:(\d{2,}):)?(\d{2}):(\d{2})\.(\d{3})"
    )

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """
        Report whether the stream looks like a WebVTT file.

        A stream is accepted when its extension or MIME type is a known
        WebVTT one, or, failing that, when its first bytes are the
        ``WEBVTT`` signature (with or without a UTF-8 byte order mark).
        The stream position is restored before returning.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        if mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES)):
            return True

        # Sniff the content; always put the stream back where it was, even
        # if the read fails, so other converters see an untouched stream.
        cur_pos = file_stream.tell()
        try:
            head = file_stream.read(len(self._UTF8_BOM) + len(self._SIGNATURE))
        finally:
            file_stream.seek(cur_pos)

        head = head or b""
        if head.startswith(self._UTF8_BOM):
            head = head[len(self._UTF8_BOM):]
        return head[: len(self._SIGNATURE)] == self._SIGNATURE

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """
        Convert a WebVTT stream into a Markdown transcript.

        Returns:
            A DocumentConverterResult whose markdown holds one
            ``[HH:MM:SS.mmm] text`` paragraph per cue, separated by blank
            lines (empty when the file has no cues).

        Raises:
            FileConversionException: if the stream is larger than
                MAX_FILE_SIZE, contains NUL characters, has more than
                MAX_CUES cues, or contains an out-of-range timestamp.
        """
        # Measure the stream (whence=2 is "from end") before reading it, so
        # that oversized inputs are rejected without being loaded.
        file_stream.seek(0, 2)
        file_size = file_stream.tell()
        file_stream.seek(0)

        if file_size > MAX_FILE_SIZE:
            raise FileConversionException(
                f"VTT file too large: {file_size} bytes (max {MAX_FILE_SIZE})"
            )

        raw = file_stream.read()
        if stream_info.charset:
            content = raw.decode(stream_info.charset)
        else:
            best_guess = from_bytes(raw).best()
            if best_guess is not None:
                content = str(best_guess)
            else:
                # Detection failed. str(None) would silently turn the whole
                # document into the text "None"; WebVTT is specified as
                # UTF-8, so fall back to a lossy UTF-8 decode instead.
                content = raw.decode("utf-8", errors="replace")

        if "\x00" in content:
            raise FileConversionException("VTT file contains null bytes")

        paragraphs = self._parse_vtt(content)
        markdown = "\n\n".join(paragraphs)

        return DocumentConverterResult(markdown=markdown)

    def _parse_vtt(self, content: str) -> list[str]:
        """
        Split decoded WebVTT text into formatted cue paragraphs.

        A cue starts at a timing line and ends at the next blank line, the
        next timing line, or the end of the input. Lines outside a cue that
        are not timing lines are ignored.

        Returns:
            One ``[HH:MM:SS.mmm] text`` string per cue that has text left
            after cleaning, in file order.

        Raises:
            FileConversionException: on more than MAX_CUES cues or on a
                timestamp whose minutes or seconds exceed 59.
        """
        paragraphs: list[str] = []
        cue_lines: list[str] = []
        start_time = ""
        cue_count = 0
        in_cue = False

        # str.strip() does not remove a byte order mark, and a leading one
        # would stop the "^"-anchored timing pattern from matching when the
        # very first line is a timing line.
        for line in content.lstrip("\ufeff").splitlines():
            stripped = line.strip()

            timing_match = self._CUE_TIMING_PATTERN.match(stripped)
            if timing_match:
                # Emit the previous cue BEFORE start_time is replaced;
                # otherwise a cue that is not followed by a blank line gets
                # stamped with the next cue's start time.
                if cue_lines:
                    self._append_cue(paragraphs, start_time, cue_lines)
                    cue_lines = []

                cue_count += 1
                if cue_count > MAX_CUES:
                    raise FileConversionException(
                        f"Too many cues in VTT file: {cue_count} (max {MAX_CUES})"
                    )

                start_time = self._validated_start_time(timing_match)
                in_cue = True
                continue

            if not in_cue:
                # Header, blank line, NOTE/STYLE/REGION content or a cue
                # identifier: none of these belong in the transcript.
                continue

            if stripped:
                cue_lines.append(stripped)
                continue

            # A blank line terminates the current cue.
            if cue_lines:
                self._append_cue(paragraphs, start_time, cue_lines)
                cue_lines = []
            in_cue = False

        # The last cue need not be followed by a blank line.
        if cue_lines and in_cue:
            self._append_cue(paragraphs, start_time, cue_lines)

        return paragraphs

    def _validated_start_time(self, timing_match: "re.Match[str]") -> str:
        """
        Validate a timing-line match and return its start as HH:MM:SS.mmm.

        An omitted hours field is treated as zero. Raises
        FileConversionException when minutes or seconds exceed 59 in either
        the start or the end timestamp.
        """
        (
            hours,
            mins,
            secs,
            millis,
            _end_hours,
            end_mins,
            end_secs,
            _end_millis,
        ) = (int(group or 0) for group in timing_match.groups())

        if mins > 59 or secs > 59:
            raise FileConversionException(
                f"Invalid timestamp in VTT: {timing_match.group(0)}"
            )
        if end_mins > 59 or end_secs > 59:
            raise FileConversionException(
                f"Invalid end timestamp in VTT: {timing_match.group(0)}"
            )

        return f"{hours:02d}:{mins:02d}:{secs:02d}.{millis:03d}"

    def _append_cue(
        self, paragraphs: list[str], start_time: str, cue_lines: list[str]
    ) -> None:
        """Clean the joined cue lines and append them, unless left empty."""
        text = self._clean_text(" ".join(cue_lines))
        if text:
            paragraphs.append(f"[{start_time}] {text}")

    def _clean_text(self, text: str) -> str:
        """
        Reduce WebVTT cue markup to plain text.

        Voice tags become a ``Name: `` prefix (for ``<v.class>`` without a
        name, the class is used), all remaining tags are removed, and runs
        of whitespace are collapsed to one space. The result is stripped.
        """
        # <v Name> - voice only
        text = re.sub(r"<v\s+([^>]+)>", r"\1: ", text)
        # <v.class Name> - voice with class (keep just the name). The class
        # token excludes ">" so the match cannot run past the end of the tag
        # and swallow the text and tags that follow it.
        text = re.sub(r"<v\.[^\s>]+\s+([^>]+)>", r"\1: ", text)
        # <v.class> - voice with class only (no name)
        text = re.sub(r"<v\.([^>\s]+)>", r"\1: ", text)
        # Every other tag (<c>, <b>, <i>, <u>, <ruby>, inline timestamps,
        # closing tags, ...) is dropped while its inner text is kept.
        text = re.sub(r"<[^>]+>", "", text)
        text = re.sub(r"\s+", " ", text)

        return text.strip()
17 changes: 17 additions & 0 deletions packages/markitdown/tests/_test_vectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,4 +276,21 @@ class FileTestVector(object):
"![This phrase of the caption is Human-written.](Picture4.jpg)",
],
),
FileTestVector(
filename="test.vtt",
mimetype="text/vtt",
charset="utf-8",
url=None,
must_include=[
"[00:00:01.000] Hello and welcome to this presentation.",
"[00:00:04.000] Presenter: Today we'll discuss the project roadmap.",
"[00:00:10.000] First, let me outline the key milestones and deliverables for this quarter.",
"[00:00:13.000] Any questions before we begin?",
],
must_not_include=[
"WEBVTT",
"-->",
"00:00:02",
],
),
]
17 changes: 17 additions & 0 deletions packages/markitdown/tests/test_files/test.vtt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
WEBVTT

00:00:01.000 --> 00:00:03.000
Hello and welcome to this presentation.

00:00:04.000 --> 00:00:06.000
<v Presenter> Today we'll discuss the project roadmap.

00:00:07.000 --> 00:00:09.000
<v Analyst> I've prepared some slides.

00:00:10.000 --> 00:00:12.000
First, let me outline the key milestones
and deliverables for this quarter.

00:00:13.000 --> 00:00:15.000
Any questions before we begin?
1 change: 1 addition & 0 deletions packages/markitdown/tests/test_files/test_empty.vtt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
WEBVTT
7 changes: 7 additions & 0 deletions packages/markitdown/tests/test_files/test_invalid_ts.vtt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
WEBVTT

00:00:01.000 --> invalid-timestamp
This cue has an invalid timestamp.

99:99:99.999 --> 00:00:05.000
Another invalid timestamp.
2 changes: 2 additions & 0 deletions packages/markitdown/tests/test_files/test_no_header.vtt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
00:00:01.000 --> 00:00:03.000
This file has no WEBVTT header.
10 changes: 10 additions & 0 deletions packages/markitdown/tests/test_files/test_special_chars.vtt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
WEBVTT

00:00:01.000 --> 00:00:03.000
Text with **bold** and *italic* markdown.

00:00:04.000 --> 00:00:06.000
Text with `code` and [link](http://example.com).

00:00:07.000 --> 00:00:09.000
Text with <script>alert('xss')</script> tag.
Loading