-
Notifications
You must be signed in to change notification settings - Fork 823
feat(docs): support auto-chunking for oversized plain markdown text #964
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,189 @@ | ||
| // Copyright (c) 2026 Lark Technologies Pte. Ltd. | ||
| // SPDX-License-Identifier: MIT | ||
|
|
||
| package doc | ||
|
|
||
| import ( | ||
| "errors" | ||
| "strings" | ||
| "unicode/utf8" | ||
|
|
||
| "github.com/larksuite/cli/internal/output" | ||
| ) | ||
|
|
||
// SafeParagraphLimit is the maximum number of Unicode code points (runes)
// allowed in a single chunk. The unit is runes, not bytes or grapheme
// clusters.
const SafeParagraphLimit = 10000

// ErrUnsafeMarkdown is the sentinel error reported for oversized markdown
// that contains constructs which cannot be split safely. The message spells
// the limit out as "10,000 characters", so it must be updated by hand if
// SafeParagraphLimit ever changes.
var ErrUnsafeMarkdown = errors.New("oversized markdown contains complex structural elements (fenced code blocks, tables, blockquotes, or HTML) that cannot be safely split. Please manually split the content below 10,000 characters before uploading")
|
|
||
// isTableAlignmentRow reports whether line looks like the delimiter row of a
// pipe table, e.g. "|---|:--:|" or "--- | ---".
//
// After trimming surrounding whitespace the line must contain at least one
// '-' and at least one '|', and must consist solely of '|', '-', ':', spaces
// and tabs. The '|' requirement keeps dash-only lines (a setext heading
// underline or a "---" thematic break) from being mistaken for a table.
func isTableAlignmentRow(line string) bool {
	line = strings.TrimSpace(line)
	if !strings.Contains(line, "-") || !strings.Contains(line, "|") {
		return false
	}
	for _, r := range line {
		switch r {
		case '|', '-', ':', ' ', '\t':
			// Allowed delimiter-row character.
		default:
			return false
		}
	}
	return true
}
|
|
||
| func containsUnsafeMarkdown(md string) bool { | ||
| lines := strings.Split(md, "\n") | ||
|
|
||
| for i, line := range lines { | ||
| trimmed := strings.TrimSpace(line) | ||
| if trimmed == "" { | ||
| continue | ||
| } | ||
|
|
||
| if strings.HasPrefix(trimmed, "```") || strings.HasPrefix(trimmed, "~~~") { | ||
| return true | ||
| } | ||
|
|
||
| if strings.HasPrefix(trimmed, ">") { | ||
| return true | ||
| } | ||
|
|
||
| temp := trimmed | ||
| for { | ||
| idx := strings.IndexByte(temp, '<') | ||
| if idx == -1 || idx+1 >= len(temp) { | ||
| break | ||
| } | ||
| next := temp[idx+1] | ||
| if (next >= 'a' && next <= 'z') || (next >= 'A' && next <= 'Z') || next == '/' || next == '!' { | ||
| return true | ||
| } | ||
| temp = temp[idx+1:] | ||
| } | ||
|
|
||
| if isTableAlignmentRow(trimmed) { | ||
| return true | ||
| } | ||
| if strings.Contains(trimmed, "|") && i+1 < len(lines) { | ||
| if isTableAlignmentRow(strings.TrimSpace(lines[i+1])) { | ||
| return true | ||
| } | ||
| } | ||
| } | ||
| return false | ||
| } | ||
|
|
||
// splitPlainParagraphs breaks md into paragraphs, where a paragraph is a
// maximal run of consecutive lines that are not blank (a line is blank when
// it is empty after trimming whitespace). Lines inside a paragraph are kept
// verbatim and joined with "\n"; blank lines are discarded. The result is nil
// when md has no non-blank line.
func splitPlainParagraphs(md string) []string {
	var (
		result  []string
		pending []string
	)

	// flush emits the lines gathered so far as one paragraph, if there are any.
	flush := func() {
		if len(pending) == 0 {
			return
		}
		result = append(result, strings.Join(pending, "\n"))
		pending = pending[:0]
	}

	for _, ln := range strings.Split(md, "\n") {
		if strings.TrimSpace(ln) != "" {
			pending = append(pending, ln)
			continue
		}
		flush()
	}
	flush()

	return result
}
|
|
||
| func splitOversizedParagraph(para string) []string { | ||
| if utf8.RuneCountInString(para) <= SafeParagraphLimit { | ||
| return []string{para} | ||
| } | ||
|
|
||
| var chunks []string | ||
| pos := 0 | ||
|
|
||
| for pos < len(para) { | ||
| remaining := para[pos:] | ||
| if utf8.RuneCountInString(remaining) <= SafeParagraphLimit { | ||
| chunks = append(chunks, remaining) | ||
| break | ||
| } | ||
|
|
||
| limitByte := runeOffset(remaining, SafeParagraphLimit) | ||
| splitAt := findSplitPoint(remaining, limitByte) | ||
|
|
||
| chunks = append(chunks, remaining[:splitAt]) | ||
| pos += splitAt | ||
| } | ||
|
|
||
| return chunks | ||
| } | ||
|
|
||
| func runeOffset(s string, n int) int { | ||
| offset := 0 | ||
| for i := 0; i < n && offset < len(s); i++ { | ||
| _, size := utf8.DecodeRuneInString(s[offset:]) | ||
| offset += size | ||
| } | ||
| return offset | ||
| } | ||
|
|
||
// findSplitPoint picks the byte offset at which to cut s, given that the cut
// may not lie beyond limitByte. Only the last quarter of the allowed range,
// bytes [limitByte*3/4, limitByte), is searched: the cut goes right after the
// last newline found there, otherwise right after the last space, otherwise
// at limitByte itself.
//
// '\n' and ' ' are single-byte ASCII characters and never occur inside a
// multi-byte UTF-8 sequence, so a cut placed right after one is always on a
// rune boundary. The lower bound of the window may fall in the middle of a
// rune; that is harmless because it only bounds the search. The fallback is
// on a rune boundary only if limitByte is — presumably the caller guarantees
// that.
func findSplitPoint(s string, limitByte int) int {
	if limitByte <= 0 {
		// Nothing to search.
		return limitByte
	}

	floor := limitByte * 3 / 4
	window := s[floor:limitByte]

	// Newline is tried first so that it wins over a later space.
	for _, sep := range []byte{'\n', ' '} {
		if at := strings.LastIndexByte(window, sep); at >= 0 {
			return floor + at + 1
		}
	}

	return limitByte
}
|
|
||
| func chunkMarkdownForUpload(md string) (string, error) { | ||
| if md == "" { | ||
| return "", nil | ||
| } | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Currently this is safe in practice because the loop only checks for ASCII bytes ( Suggestion: either add a comment explicitly documenting this invariant, or align minByte := limitByte * 3 / 4
// Align to rune boundary
for minByte < len(s) && !utf8.RuneStart(s[minByte]) {
minByte++
} |
||
|
|
||
| if utf8.RuneCountInString(md) <= SafeParagraphLimit { | ||
| return md, nil | ||
| } | ||
|
|
||
| if containsUnsafeMarkdown(md) { | ||
| return "", ErrUnsafeMarkdown | ||
| } | ||
|
|
||
| paragraphs := splitPlainParagraphs(md) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When no newline or space is found in the search range, However, the intermediate positions checked in the loops above ( This works today, but the correctness depends on an unstated assumption. Consider adding a comment or a debug assertion: // Safe because \n and ' ' are single-byte ASCII runes,
// so i+1 is always a valid rune boundary.
return i + 1 |
||
| var chunks []string | ||
|
|
||
| for _, para := range paragraphs { | ||
| chunks = append(chunks, splitOversizedParagraph(para)...) | ||
| } | ||
|
|
||
| return strings.Join(chunks, "\n\n"), nil | ||
| } | ||
|
|
||
| func applyChunkingToBody(body map[string]interface{}, contentKey, docFormat string) error { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Re-joining with If the original markdown had 3+ blank lines between paragraphs, or if For rendering purposes this is usually fine, but it's worth documenting this behavior. Alternatively, you could track the original separator between paragraphs and restore it on re-join. |
||
| if docFormat != "markdown" { | ||
| return nil | ||
| } | ||
| content, _ := body[contentKey].(string) | ||
| if content == "" { | ||
| return nil | ||
| } | ||
| chunked, err := chunkMarkdownForUpload(content) | ||
| if err != nil { | ||
| return output.Errorf(output.ExitAPI, "chunk_error", "%v", err) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Silent type assertion ignores non-string content. content, _ := body[contentKey].(string)If This could lead to hard-to-debug issues where oversized content silently bypasses chunking. Consider either:
|
||
| } | ||
| body[contentKey] = chunked | ||
| return nil | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The name `SafeParagraphLimit` doesn't convey that the unit is runes (Unicode code points), not bytes or grapheme clusters. Consider renaming it to `SafeParagraphRuneLimit`, or adding a comment like `// SafeParagraphLimit is the maximum number of Unicode code points (runes) per chunk.` to make the unit explicit.