Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 32 additions & 2 deletions packages/markitdown/src/markitdown/converters/_pptx_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,10 +161,11 @@ def get_shape_content(shape, **kwargs):

# Text areas
elif shape.has_text_frame:
text = self._convert_text_frame_to_markdown(shape.text_frame)
if shape == title:
md_content += "# " + shape.text.lstrip() + "\n"
md_content += "# " + text.lstrip() + "\n"
else:
md_content += shape.text + "\n"
md_content += text + "\n"

# Group Shapes
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
Expand Down Expand Up @@ -212,6 +213,35 @@ def _is_table(self, shape):
return True
return False

def _convert_text_frame_to_markdown(self, text_frame):
"""Convert a text frame to markdown, preserving hyperlinks and list hierarchy."""
paragraphs_md = []
for para in text_frame.paragraphs:
para_md = self._convert_paragraph_to_markdown(para)
if para_md.strip():
paragraphs_md.append(para_md)
return "\n".join(paragraphs_md)

def _convert_paragraph_to_markdown(self, paragraph):
"""Convert a single paragraph to markdown, preserving hyperlinks."""
runs_md = []
for run in paragraph.runs:
run_text = run.text
if run.hyperlink.address:
# Escape brackets in link text to avoid breaking markdown syntax
escaped_text = run_text.replace("[", "\\[").replace("]", "\\]")
run_text = f"[{escaped_text}]({run.hyperlink.address})"
runs_md.append(run_text)

text = "".join(runs_md)

# Handle list indentation based on paragraph level
if paragraph.level > 0 and text.strip():
indent = " " * paragraph.level
text = indent + "- " + text

return text

def _convert_table_to_markdown(self, table, **kwargs):
# Write the table as HTML, then convert it to Markdown
html_table = "<html><body><table>"
Expand Down
15 changes: 15 additions & 0 deletions packages/markitdown/tests/_test_vectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,21 @@ class FileTestVector(object):
],
must_not_include=[],
),
FileTestVector(
filename="test_hyperlinks.pptx",
mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation",
charset=None,
url=None,
must_include=[
"[Visit Microsoft](https://microsoft.com)",
" - Second level item",
" - Third level item",
" - [GitHub link in list](https://github.com)",
],
must_not_include=[
"Visit Microsofthttps://microsoft.com", # ensure hyperlink is formatted, not plain text concatenation
],
),
]


Expand Down
Binary file not shown.