Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions openviking/parse/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@
# Common utility functions
# ============================================================================

# Placeholder prefix that parser-generated Markdown image links are written with.
# NOTE(review): presumably replaced by the final resource URI in a later stage — confirm.
RESOURCE_ROOT_PLACEHOLDER = "__OPENVIKING_RESOURCE_ROOT__"
# NOTE(review): presumably the metadata key under which attachments travel;
# no use is visible in this change — confirm against callers.
RESOURCE_ATTACHMENTS_META_KEY = "_attachments"
# Metadata key under which MarkdownParser stores the placeholder string
# whenever a parse result carries attachments.
RESOURCE_ROOT_PLACEHOLDER_META_KEY = "resource_root_placeholder"


def calculate_media_strategy(image_count: int, line_count: int) -> str:
"""
Expand Down
4 changes: 2 additions & 2 deletions openviking/parse/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@


class DocumentConverter:
"""Converts documents to PDF for consistent rendering (DOCX/MD/PPTX -> PDF)."""
"""Converts documents to PDF for consistent rendering."""

def __init__(self, temp_dir: Optional[Path] = None):
self.temp_dir = temp_dir
Expand All @@ -24,7 +24,7 @@ async def to_pdf(self, file_path: Path) -> Optional[Path]:

if ext == ".pdf":
return file_path
elif ext in (".docx", ".pptx"):
elif ext in (".doc", ".docx", ".pptx"):
return await self._convert_with_libreoffice(file_path)
elif ext in (".md", ".markdown"):
return await self._convert_markdown_to_pdf(file_path)
Expand Down
81 changes: 81 additions & 0 deletions openviking/parse/parsers/image_attachments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
# SPDX-License-Identifier: AGPL-3.0
"""Helpers for parser-produced image attachments."""

from __future__ import annotations

from pathlib import Path
from typing import Any, Dict, Optional

from openviking.parse.base import RESOURCE_ROOT_PLACEHOLDER

# Lower-case file suffixes (with leading dot) that are treated as image attachments.
SUPPORTED_IMAGE_EXTENSIONS = frozenset({".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".svg"})

# Lookup from a lower-case image MIME type to the extension used for the stored file.
# Both "image/jpeg" and the non-standard "image/jpg" resolve to ".jpg".
_CONTENT_TYPE_EXTENSIONS = {
"image/png": ".png",
"image/jpeg": ".jpg",
"image/jpg": ".jpg",
"image/gif": ".gif",
"image/bmp": ".bmp",
"image/webp": ".webp",
"image/svg+xml": ".svg",
}


def image_media_path(index: int, extension: str) -> str:
    """Build the canonical relative path for a parser-produced image.

    Args:
        index: Number embedded in the file name (``image-<index>``).
        extension: Image extension; it is passed through
            ``_normalize_extension`` before being appended.

    Returns:
        A POSIX-style path of the form ``media/images/image-<index><ext>``.
    """
    suffix = _normalize_extension(extension)
    return "media/images/image-{}{}".format(index, suffix)


def markdown_image_reference(media_path: str, alt_text: str = "image") -> str:
    """Format a Markdown image link that points at ``media_path``.

    The link destination is prefixed with ``RESOURCE_ROOT_PLACEHOLDER`` so the
    reference can later be rewritten to the real resource URI.

    Args:
        media_path: Attachment path relative to the resource root.
        alt_text: Text placed between the square brackets.

    Returns:
        The ``![alt](placeholder/path)`` Markdown snippet.
    """
    destination = f"{RESOURCE_ROOT_PLACEHOLDER}/{media_path}"
    return f"![{alt_text}]({destination})"


def image_attachment(media_path: str, content: bytes) -> Dict[str, Any]:
    """Wrap an image into the attachment payload consumed by MarkdownParser.

    Args:
        media_path: Relative path the attachment should be stored under.
        content: Raw image bytes.

    Returns:
        A dict with the keys ``"path"`` and ``"content"``.
    """
    payload: Dict[str, Any] = dict(path=media_path, content=content)
    return payload


def image_extension_from_name_type_or_data(
    name: str = "",
    content_type: str = "",
    data: Optional[bytes] = None,
) -> str:
    """Choose a stable supported image extension from metadata or bytes.

    The sources are consulted in order and the first hit wins:

    1. the suffix of ``name`` when it is in ``SUPPORTED_IMAGE_EXTENSIONS``
       (``.jpeg`` is folded into ``.jpg``);
    2. ``content_type`` looked up in ``_CONTENT_TYPE_EXTENSIONS``;
    3. the byte signature of ``data`` via ``detect_image_extension``;
    4. the ``.png`` default.

    Args:
        name: Original file name or path; ``None`` is treated like ``""``.
        content_type: Declared MIME type. Parameters (``; charset=...``) and
            surrounding whitespace are ignored; ``None`` is treated like ``""``.
        data: Raw image bytes used for signature sniffing when present.

    Returns:
        A lower-case extension including the leading dot.
    """
    suffix = Path(name or "").suffix.lower()
    if suffix in SUPPORTED_IMAGE_EXTENSIONS:
        return ".jpg" if suffix == ".jpeg" else suffix

    # A Content-Type value may carry parameters ("image/png; charset=binary")
    # or padding; only the bare media type is a key of the lookup table.
    media_type = (content_type or "").split(";", 1)[0].strip().lower()
    ext = _CONTENT_TYPE_EXTENSIONS.get(media_type)
    if ext:
        return ext

    if data:
        return detect_image_extension(data)

    return ".png"


def detect_image_extension(data: bytes) -> str:
    """Detect a supported image extension from file signatures.

    Binary formats are recognised by their magic bytes. SVG is recognised by a
    leading ``<svg`` tag or ``<?xml`` prolog, after skipping an optional UTF-8
    byte-order mark and leading whitespace.

    Args:
        data: Raw file content (only the first bytes are inspected).

    Returns:
        One of ``.png``, ``.jpg``, ``.gif``, ``.bmp``, ``.webp``, ``.svg``;
        ``.png`` is also the fallback when nothing matches.
    """
    if data.startswith(b"\x89PNG\r\n\x1a\n"):
        return ".png"
    if data.startswith(b"\xff\xd8\xff"):
        return ".jpg"
    if data.startswith((b"GIF87a", b"GIF89a")):
        return ".gif"
    if data.startswith(b"BM"):
        return ".bmp"
    # RIFF is a generic container; the form type at offset 8 identifies WebP.
    if data.startswith(b"RIFF") and data[8:12] == b"WEBP":
        return ".webp"

    # SVG is text, so editors may prepend a UTF-8 BOM; drop it before looking
    # for the markup, otherwise such files would be mislabelled as PNG.
    utf8_bom = b"\xef\xbb\xbf"
    text = data[len(utf8_bom):] if data.startswith(utf8_bom) else data
    # Any XML prolog is accepted as SVG; the root element is not inspected.
    if text.lstrip().startswith((b"<svg", b"<?xml")):
        return ".svg"
    return ".png"


def _normalize_extension(extension: str) -> str:
    """Return ``extension`` in canonical form.

    The value is trimmed, lower-cased and given a leading dot; ``.jpeg`` is
    folded into ``.jpg``. An empty extension (``""``, ``"."`` or ``None``)
    becomes ``.png`` so the resulting file name never ends in a bare dot.

    Args:
        extension: Extension with or without the leading dot.

    Returns:
        The normalised extension, always starting with ``"."`` and non-empty
        after the dot.
    """
    ext = (extension or "").strip().lower()
    if ext in ("", "."):
        # Same default the rest of this module uses when nothing better is known.
        return ".png"
    if not ext.startswith("."):
        ext = f".{ext}"
    return ".jpg" if ext == ".jpeg" else ext
37 changes: 36 additions & 1 deletion openviking/parse/parsers/legacy_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from typing import List, Optional, Union

from openviking.parse.base import ParseResult
from openviking.parse.converter import DocumentConverter
from openviking.parse.parsers.base_parser import BaseParser
from openviking_cli.utils.config.parser_config import ParserConfig
from openviking_cli.utils.logger import get_logger
Expand Down Expand Up @@ -50,9 +51,16 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
path = Path(source)

if path.exists():
converted = await self._parse_converted_pdf(path, instruction, kwargs)
if converted:
return converted

text = await asyncio.to_thread(self._extract_text, path)
result = await self._md_parser.parse_content(
text, source_path=str(path), instruction=instruction, **kwargs
text,
source_path=str(path),
instruction=instruction,
**kwargs,
)
else:
result = await self._md_parser.parse_content(
Expand All @@ -62,6 +70,33 @@ async def parse(self, source: Union[str, Path], instruction: str = "", **kwargs)
result.parser_name = "LegacyDocParser"
return result

async def _parse_converted_pdf(
    self,
    path: Path,
    instruction: str,
    kwargs: dict,
) -> Optional[ParseResult]:
    """Parse ``path`` by way of an intermediate PDF when conversion succeeds.

    Args:
        path: The source document.
        instruction: Forwarded unchanged to the PDF parsing step.
        kwargs: Extra keyword arguments, expanded into the PDF parsing call.

    Returns:
        The PDF parse result relabelled as a ``doc`` parse by
        ``LegacyDocParser``, or ``None`` when no PDF could be produced.
    """
    converted_pdf = await self._convert_to_pdf(path)
    if converted_pdf:
        parsed = await self._parse_pdf(converted_pdf, instruction=instruction, **kwargs)
        # Report the original document rather than the intermediate PDF.
        parsed.source_path = str(path)
        parsed.source_format = "doc"
        parsed.parser_name = "LegacyDocParser"
        # Record how the content was obtained.
        parsed.meta["converted_from"] = "doc"
        parsed.meta["intermediate_format"] = "pdf"
        return parsed
    return None

async def _convert_to_pdf(self, path: Path) -> Optional[Path]:
    """Run ``path`` through a fresh ``DocumentConverter`` and return what ``to_pdf`` gives back."""
    converter = DocumentConverter()
    pdf_path = await converter.to_pdf(path)
    return pdf_path

async def _parse_pdf(self, path: Path, instruction: str = "", **kwargs) -> ParseResult:
    """Parse the PDF at ``path`` with a new ``PDFParser`` instance.

    ``instruction`` and any extra keyword arguments are forwarded unchanged.
    """
    # Imported at call time instead of module load.
    # NOTE(review): presumably to avoid an import cycle or a heavy import — confirm.
    from openviking.parse.parsers.pdf import PDFParser

    pdf_parser = PDFParser()
    return await pdf_parser.parse(path, instruction=instruction, **kwargs)

async def parse_content(
self, content: str, source_path: Optional[str] = None, instruction: str = "", **kwargs
) -> ParseResult:
Expand Down
168 changes: 166 additions & 2 deletions openviking/parse/parsers/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,32 @@
import hashlib
import re
import time
from pathlib import Path
from pathlib import Path, PurePosixPath
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
from urllib.parse import unquote, urlparse

from openviking.parse.accessors.mime_types import IANA_MEDIA_TYPE_TO_EXTENSION
from openviking.parse.base import NodeType, ParseResult, ResourceNode, create_parse_result
from openviking.parse.base import (
RESOURCE_ROOT_PLACEHOLDER,
RESOURCE_ROOT_PLACEHOLDER_META_KEY,
NodeType,
ParseResult,
ResourceNode,
create_parse_result,
)
from openviking.parse.parsers.base_parser import BaseParser
from openviking.parse.parsers.constants import (
CODE_EXTENSIONS,
DOCUMENTATION_EXTENSIONS,
IGNORE_EXTENSIONS,
)
from openviking.parse.parsers.image_attachments import (
SUPPORTED_IMAGE_EXTENSIONS,
image_attachment,
image_extension_from_name_type_or_data,
image_media_path,
markdown_image_reference,
)
from openviking_cli.utils.config.parser_config import ParserConfig
from openviking_cli.utils.logger import get_logger

Expand Down Expand Up @@ -195,6 +210,25 @@ async def parse_content(
f"[MarkdownParser] Extracted frontmatter: {list(frontmatter.keys())}"
)

attachments = list(kwargs.get("attachments") or [])
if base_dir:
content, local_attachments, image_warnings = self._attach_local_images(
content,
base_dir=Path(base_dir),
start_index=self._next_image_index(attachments),
)
attachments.extend(local_attachments)
warnings.extend(image_warnings)
if local_attachments:
meta["images_extracted"] = len(local_attachments)

if attachments:
meta[RESOURCE_ROOT_PLACEHOLDER_META_KEY] = RESOURCE_ROOT_PLACEHOLDER
meta.setdefault(
"images_extracted",
self._count_image_attachments(attachments),
)

# Collect metadata
# images = list(self._image_pattern.finditer(content))
# image_count = len(images)
Expand Down Expand Up @@ -241,6 +275,7 @@ async def parse_content(
source_path,
doc_name=self._sanitize_for_path(doc_title),
)
await self._write_attachments(root_dir, attachments)

parse_time = time.time() - start_time
logger.info(f"[MarkdownParser] Parse completed in {parse_time:.2f}s")
Expand Down Expand Up @@ -515,6 +550,135 @@ async def _parse_and_create_structure(
content, headings, root_dir, sections, doc_name, max_size, min_size
)

async def _write_attachments(self, root_dir: str, attachments: List[Dict[str, Any]]) -> None:
    """Store binary attachments below ``root_dir`` in the temp resource tree.

    Each attachment is a mapping with a relative ``"path"`` and a binary
    ``"content"``. Entries with an empty, absolute or ``..``-containing path,
    or with non-binary content, are skipped with a warning.

    Args:
        root_dir: Root of the temp tree the files are written into.
        attachments: Attachment payloads to persist; may be empty.
    """
    if not attachments:
        return

    fs = self._get_viking_fs()
    forbidden_parts = {"", ".", ".."}
    saved = 0
    for entry in attachments:
        relative = str(entry.get("path") or "").strip()
        candidate = PurePosixPath(relative)
        # Only plain relative paths may be written below root_dir.
        is_safe = (
            bool(relative)
            and not candidate.is_absolute()
            and all(part not in forbidden_parts for part in candidate.parts)
        )
        if not is_safe:
            logger.warning("[MarkdownParser] Skipping unsafe attachment path: %r", relative)
            continue

        payload = entry.get("content")
        if not isinstance(payload, (bytes, bytearray)):
            logger.warning("[MarkdownParser] Skipping non-binary attachment: %s", relative)
            continue

        destination = f"{root_dir}/{candidate.as_posix()}"
        await fs.write_file_bytes(destination, bytes(payload))
        saved += 1

    if saved:
        logger.info("[MarkdownParser] Wrote %d attachment(s) into temp tree", saved)

def _attach_local_images(
    self,
    content: str,
    base_dir: Path,
    start_index: int,
) -> Tuple[str, List[Dict[str, Any]], List[str]]:
    """Turn local Markdown image references into parser-produced attachments.

    Every match of ``self._image_pattern`` whose target resolves to a local
    image file is read, stored as an attachment and replaced by a placeholder
    reference. A file referenced several times is attached only once.
    References that cannot be resolved or read are left untouched.

    Args:
        content: Markdown text to scan.
        base_dir: Directory that relative image targets are resolved against.
        start_index: Highest image number already in use; new images are
            numbered from ``start_index + 1``.

    Returns:
        A tuple of the rewritten Markdown, the new attachment payloads and
        warning messages for images that failed to attach.
    """
    collected: List[Dict[str, Any]] = []
    problems: List[str] = []
    seen: Dict[Path, str] = {}  # resolved file -> media path already assigned
    counter = start_index

    def _rewrite(found: re.Match[str]) -> str:
        nonlocal counter
        original_text = found.group(0)
        label = found.group(1) or "image"
        target = found.group(2)

        located = self._resolve_local_image_path(target, base_dir)
        if not located:
            return original_text

        try:
            real_path = located.resolve()
            media_path = seen.get(real_path)
            if media_path is None:
                payload = real_path.read_bytes()
                suffix = image_extension_from_name_type_or_data(
                    name=real_path.name,
                    data=payload,
                )
                counter += 1
                media_path = image_media_path(counter, suffix)
                collected.append(image_attachment(media_path, payload))
                seen[real_path] = media_path
        except Exception as exc:
            # Best effort: keep the original reference and report the failure.
            problems.append(f"Failed to attach local image '{target}': {exc}")
            return original_text

        return markdown_image_reference(media_path, alt_text=label)

    rewritten = self._image_pattern.sub(_rewrite, content)
    return rewritten, collected, problems

def _resolve_local_image_path(self, raw_target: str, base_dir: Path) -> Optional[Path]:
    """Resolve a Markdown image target when it points to a supported local file.

    Remote URLs, fragment-only targets, targets with a network location and
    files whose suffix is not in ``SUPPORTED_IMAGE_EXTENSIONS`` are rejected.
    Malformed targets never raise; they are reported as "not local".

    NOTE(review): absolute paths and ``file:`` URLs are accepted and the
    result is not confined to ``base_dir`` — confirm this is intended when
    parsing untrusted Markdown.

    Args:
        raw_target: Text between the parentheses of the image reference,
            possibly including a title.
        base_dir: Directory that relative targets are resolved against.

    Returns:
        The path of the existing image file, or ``None`` when the target is
        not a usable local image.
    """
    target = self._extract_markdown_link_destination(raw_target)
    if not target or target.startswith("#"):
        return None

    try:
        parsed = urlparse(target)
    except ValueError:
        # urlparse rejects some malformed network locations (e.g. "//[x");
        # such a target cannot be a local file.
        return None
    if parsed.scheme and parsed.scheme != "file":
        return None
    if parsed.netloc:
        return None

    raw_path = unquote(parsed.path if parsed.scheme == "file" else target)
    if not raw_path:
        return None

    path = Path(raw_path)
    if not path.is_absolute():
        path = base_dir / path

    if path.suffix.lower() not in SUPPORTED_IMAGE_EXTENSIONS:
        return None
    try:
        exists = path.is_file()
    except (OSError, ValueError):
        # The existence check itself can fail on some Python versions
        # (e.g. over-long file names); treat that as "no such file" so a
        # single odd reference does not abort the whole parse.
        return None
    return path if exists else None

@staticmethod
def _extract_markdown_link_destination(raw_target: str) -> str:
    """Return only the destination part of a Markdown link target.

    Handles the angle-bracket form (``<my image.png> "title"``) and the plain
    form where an optional title follows the destination after whitespace.

    Args:
        raw_target: Text between the parentheses of a link or image.

    Returns:
        The destination without title or surrounding whitespace; ``""`` when
        the target is blank.
    """
    target = raw_target.strip()
    if target.startswith("<"):
        end = target.find(">")
        if end > 0:
            return target[1:end].strip()
    # The title may be separated by any whitespace (space, tab, newline),
    # so split on whitespace runs instead of testing for a literal space.
    pieces = target.split(maxsplit=1)
    return pieces[0] if pieces else ""

@staticmethod
def _next_image_index(attachments: List[Dict[str, Any]]) -> int:
    """Return the largest image number already used by ``attachments``.

    Only paths whose last component looks like ``image-<N>.<ext>`` count.
    Despite the name, the value returned is the highest number in use (``0``
    when there is none), not the next free one.
    """
    numbered = re.compile(r"(?:^|/)image-(\d+)\.[^/]+$")
    hits = (numbered.search(str(item.get("path") or "")) for item in attachments)
    return max((int(hit.group(1)) for hit in hits if hit), default=0)

@staticmethod
def _count_image_attachments(attachments: List[Dict[str, Any]]) -> int:
    """Count the attachments whose path lies under ``media/images/``."""
    total = 0
    for item in attachments:
        if str(item.get("path", "")).startswith("media/images/"):
            total += 1
    return total

async def _process_sections_with_merge(
self,
content: str,
Expand Down
Loading
Loading