Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions shortcuts/doc/docs_create_v2.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ func dryRunCreateV2(_ context.Context, runtime *common.RuntimeContext) *common.D
func executeCreateV2(_ context.Context, runtime *common.RuntimeContext) error {
body := buildCreateBody(runtime)

if err := applyChunkingToBody(body, "content", runtime.Str("doc-format")); err != nil {
return err
}

data, err := doDocAPI(runtime, "POST", "/open-apis/docs_ai/v1/documents", body)
if err != nil {
return err
Expand Down
4 changes: 4 additions & 0 deletions shortcuts/doc/docs_update_v2.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,10 @@ func executeUpdateV2(_ context.Context, runtime *common.RuntimeContext) error {
apiPath := fmt.Sprintf("/open-apis/docs_ai/v1/documents/%s", ref.Token)
body := buildUpdateBody(runtime)

if err := applyChunkingToBody(body, "content", runtime.Str("doc-format")); err != nil {
return err
}

data, err := doDocAPI(runtime, "PUT", apiPath, body)
if err != nil {
return err
Expand Down
189 changes: 189 additions & 0 deletions shortcuts/doc/markdown_chunk.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
// Copyright (c) 2026 Lark Technologies Pte. Ltd.
// SPDX-License-Identifier: MIT

package doc

import (
"errors"
"strings"
"unicode/utf8"

"github.com/larksuite/cli/internal/output"
)

const SafeParagraphLimit = 10000

var ErrUnsafeMarkdown = errors.New("oversized markdown contains complex structural elements (fenced code blocks, tables, blockquotes, or HTML) that cannot be safely split. Please manually split the content below 10,000 characters before uploading")

func isTableAlignmentRow(line string) bool {
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The name SafeParagraphLimit doesn't convey that the unit is runes (Unicode code points), not bytes or grapheme clusters. Consider renaming to SafeParagraphRuneLimit or adding a comment like // SafeParagraphLimit is the maximum number of Unicode code points (runes) per chunk. to make the unit explicit.

line = strings.TrimSpace(line)
if line == "" || !strings.Contains(line, "-") {
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error message says "below 10,000 characters", but SafeParagraphLimit is actually 10,000 runes (Unicode code points). For CJK text or emoji, a single grapheme cluster may consist of multiple runes, so "characters" can be ambiguous. Suggest changing to "10,000 Unicode code points (runes)" or "10,000 runes" for precision.

return false
}
for _, r := range line {
if r != '|' && r != '-' && r != ':' && r != ' ' && r != '\t' {
return false
}
}
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

False positive on setext-style headings.

isTableAlignmentRow("--- ---") returns true because the line contains only '-' and ' ' characters, none of which are disallowed. However, in Markdown, a line of --- under text is a setext h2 heading, not a table alignment row:

Section Title
---

This means any oversized document using setext headings will be incorrectly flagged as containing a table and rejected with ErrUnsafeMarkdown.

Suggestion: require at least one | character in the line for it to qualify as a table alignment row, or check the preceding line for pipe characters. This would also fix the test case {"no pipe at all", "--- ---", true} which should likely be false.

return true
}

// containsUnsafeMarkdown reports whether md contains a construct that the
// chunker refuses to split: a fenced code block, a blockquote, something that
// looks like an HTML tag, or a pipe table.
//
// Detection is line based and deliberately conservative. Each non-blank line
// is trimmed and then checked for:
//   - a leading "```" or "~~~" (code fence),
//   - a leading ">" (blockquote),
//   - a '<' immediately followed by an ASCII letter, '/' or '!' (HTML tag,
//     closing tag, comment or declaration),
//   - a table delimiter row, or a line containing '|' that is directly
//     followed by a delimiter-shaped row.
//
// A line made only of dashes, colons and spaces is treated as a table
// delimiter row only when it also contains a '|'. Without that requirement a
// setext heading underline ("Title\n---") or a thematic break ("---") would be
// misclassified as a table and the whole document rejected.
func containsUnsafeMarkdown(md string) bool {
	lines := strings.Split(md, "\n")

	for i, raw := range lines {
		line := strings.TrimSpace(raw)
		if line == "" {
			continue
		}

		switch {
		case strings.HasPrefix(line, "```"), strings.HasPrefix(line, "~~~"):
			return true
		case strings.HasPrefix(line, ">"):
			return true
		}

		// Look at every '<' that has a following byte; a letter, '/' or '!'
		// right after it is treated as the start of HTML markup.
		for j := 0; j+1 < len(line); j++ {
			if line[j] != '<' {
				continue
			}
			next := line[j+1]
			isLetter := (next >= 'a' && next <= 'z') || (next >= 'A' && next <= 'Z')
			if isLetter || next == '/' || next == '!' {
				return true
			}
		}

		// Both table checks need a pipe on the current line; pipe-less dash
		// rows are setext underlines or thematic breaks, not tables.
		if !strings.Contains(line, "|") {
			continue
		}
		if isTableAlignmentRow(line) {
			return true
		}
		if i+1 < len(lines) && isTableAlignmentRow(strings.TrimSpace(lines[i+1])) {
			return true
		}
	}
	return false
}

// splitPlainParagraphs breaks md into paragraphs, where a paragraph is a run
// of consecutive non-blank lines. Lines that are empty or whitespace-only act
// as separators and are dropped; the lines of a paragraph are kept verbatim
// and joined with a single "\n". The result is nil when md has no non-blank
// line.
func splitPlainParagraphs(md string) []string {
	var (
		result  []string
		pending []string // lines of the paragraph currently being collected
	)

	// flush emits the collected lines as one paragraph, if there are any.
	flush := func() {
		if len(pending) == 0 {
			return
		}
		result = append(result, strings.Join(pending, "\n"))
		pending = pending[:0]
	}

	for _, ln := range strings.Split(md, "\n") {
		if strings.TrimSpace(ln) != "" {
			pending = append(pending, ln)
			continue
		}
		flush()
	}
	flush()

	return result
}

// splitOversizedParagraph cuts para into consecutive pieces of at most
// SafeParagraphLimit runes each. A paragraph that already fits is returned as
// a single-element slice. Concatenating the returned pieces reproduces para
// exactly; no bytes are added or removed.
//
// Each cut position is chosen by findSplitPoint, given the byte offset that
// corresponds to SafeParagraphLimit runes (computed by runeOffset).
func splitOversizedParagraph(para string) []string {
	var pieces []string
	rest := para

	for utf8.RuneCountInString(rest) > SafeParagraphLimit {
		cut := findSplitPoint(rest, runeOffset(rest, SafeParagraphLimit))
		pieces = append(pieces, rest[:cut])
		rest = rest[cut:]
	}

	// The tail is always emitted for an input that was never split (even an
	// empty one); after a split it is emitted only if something is left.
	if len(pieces) == 0 || rest != "" {
		pieces = append(pieces, rest)
	}
	return pieces
}

// runeOffset returns the byte offset in s just past its first n runes. It
// returns len(s) when s holds fewer than n runes and 0 when n is not
// positive. Each byte that is not part of a valid UTF-8 sequence counts as
// one rune of width one.
func runeOffset(s string, n int) int {
	seen := 0
	// Ranging over a string yields the starting byte index of every rune.
	for idx := range s {
		if seen >= n {
			return idx
		}
		seen++
	}
	return len(s)
}

// findSplitPoint picks the byte index at which to cut s, given limitByte, the
// largest cut position allowed. It searches backwards through the last
// quarter of that range (bytes limitByte-1 down to limitByte*3/4) and returns
// the index just after the newline nearest to limitByte; if there is no
// newline, just after the nearest space; if there is neither, limitByte.
//
// The lower search bound is plain byte arithmetic and may fall inside a
// multi-byte rune. That is harmless here because only the single-byte ASCII
// characters '\n' and ' ' are matched, and those bytes never occur inside a
// multi-byte UTF-8 sequence, so the index after a match starts a new rune.
func findSplitPoint(s string, limitByte int) int {
	lowest := limitByte * 3 / 4
	spaceAt := -1 // index of the space closest to limitByte, if any

	for i := limitByte - 1; i >= lowest; i-- {
		switch s[i] {
		case '\n':
			// A newline anywhere in the window beats any space.
			return i + 1
		case ' ':
			if spaceAt < 0 {
				spaceAt = i
			}
		}
	}

	if spaceAt >= 0 {
		return spaceAt + 1
	}
	return limitByte
}

func chunkMarkdownForUpload(md string) (string, error) {
if md == "" {
return "", nil
}
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minByte is not guaranteed to be on a rune boundary.

minByte = limitByte * 3 / 4 is a pure byte-level arithmetic calculation. If the string contains multi-byte UTF-8 characters (CJK, emoji, etc.), minByte may land in the middle of a rune.

Currently this is safe in practice because the loop only checks for ASCII bytes (\n and ' '), so it will never slice at minByte directly. However, the invariant is implicit and fragile — if someone later adds a non-ASCII split character check, it would silently produce invalid UTF-8.

Suggestion: either add a comment explicitly documenting this invariant, or align minByte to the next rune boundary:

minByte := limitByte * 3 / 4
// Align to rune boundary
for minByte < len(s) && !utf8.RuneStart(s[minByte]) {
    minByte++
}


if utf8.RuneCountInString(md) <= SafeParagraphLimit {
return md, nil
}

if containsUnsafeMarkdown(md) {
return "", ErrUnsafeMarkdown
}

paragraphs := splitPlainParagraphs(md)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When no newline or space is found in the search range, findSplitPoint falls back to limitByte. This is correct because limitByte comes from runeOffset and is guaranteed to be on a rune boundary.

However, the intermediate positions checked in the loops above (i from limitByte-1 down to minByte) are not guaranteed to be on rune boundaries. When findSplitPoint returns i+1 (after finding '\n' or ' '), that position happens to be safe because '\n' and ' ' are single-byte ASCII characters, so i+1 is always a valid rune start.

This works today, but the correctness depends on an unstated assumption. Consider adding a comment or a debug assertion:

// Safe because \n and ' ' are single-byte ASCII runes,
// so i+1 is always a valid rune boundary.
return i + 1

var chunks []string

for _, para := range paragraphs {
chunks = append(chunks, splitOversizedParagraph(para)...)
}

return strings.Join(chunks, "\n\n"), nil
}

func applyChunkingToBody(body map[string]interface{}, contentKey, docFormat string) error {
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Re-joining with "\n\n" changes the original whitespace structure.

If the original markdown had 3+ blank lines between paragraphs, or if splitOversizedParagraph splits at a newline causing a trailing \n in a chunk, the re-joined output will have different whitespace than the input. This means chunkMarkdownForUpload is not truly idempotent — calling it twice on the same input could produce different results.

For rendering purposes this is usually fine, but it's worth documenting this behavior. Alternatively, you could track the original separator between paragraphs and restore it on re-join.

if docFormat != "markdown" {
return nil
}
content, _ := body[contentKey].(string)
if content == "" {
return nil
}
chunked, err := chunkMarkdownForUpload(content)
if err != nil {
return output.Errorf(output.ExitAPI, "chunk_error", "%v", err)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Silent type assertion ignores non-string content.

content, _ := body[contentKey].(string)

If body[contentKey] exists but is not a string (e.g., []byte, json.RawMessage), the type assertion fails silently and content becomes "". The function then returns nil with no error, effectively skipping chunking for that content.

This could lead to hard-to-debug issues where oversized content silently bypasses chunking. Consider either:

  1. Returning an error when the key exists but isn't a string, or
  2. Logging a warning, or
  3. Adding a comment explaining why silent fallback is intentional.

}
body[contentKey] = chunked
return nil
}
Loading
Loading