Skip to content

Commit b52ff33

Browse files
committed
fix(scrape): preserve ordered list markers
1 parent 7954d02 commit b52ff33

3 files changed

Lines changed: 48 additions & 4 deletions

File tree

src/cortex-cli/src/agent_cmd/tests.rs

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -3,10 +3,9 @@
33
#[cfg(test)]
44
mod tests {
55
use crate::agent_cmd::cli::{CopyArgs, ExportArgs};
6-
use crate::agent_cmd::loader::{
7-
load_builtin_agents, parse_frontmatter, read_file_with_encoding,
8-
};
6+
use crate::agent_cmd::loader::{load_builtin_agents, parse_frontmatter};
97
use crate::agent_cmd::types::AgentMode;
8+
use crate::utils::file::read_file_with_encoding;
109

1110
#[test]
1211
fn test_read_file_with_utf8() {

src/cortex-cli/src/scrape_cmd/html.rs

Lines changed: 24 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -311,7 +311,8 @@ fn process_node_to_markdown(
311311
}
312312
"li" => {
313313
let indent = " ".repeat(list_depth.saturating_sub(1));
314-
output.push_str(&format!("\n{indent}- "));
314+
let marker = list_item_marker(element_ref);
315+
output.push_str(&format!("\n{indent}{marker}"));
315316
process_node_to_markdown(
316317
element_ref,
317318
output,
@@ -448,6 +449,28 @@ fn process_node_to_markdown(
448449
}
449450
}
450451

452+
/// Returns the Markdown list marker (including its trailing space) for an `<li>` element.
///
/// An item whose parent element is an `<ol>` gets a numeric marker such as `"3. "`.
/// Numbering begins at the parent's `start` attribute (falling back to 1 when the
/// attribute is absent or does not parse as a non-negative integer) and advances by
/// one for every preceding `<li>` sibling. Any other item, including one without a
/// parent element, gets the bullet marker `"- "`.
fn list_item_marker(element_ref: scraper::ElementRef) -> String {
    let ordered_parent = element_ref
        .parent()
        .and_then(scraper::ElementRef::wrap)
        .filter(|parent| parent.value().name() == "ol");
    let Some(parent) = ordered_parent else {
        return "- ".to_string();
    };

    // Scraped markup may carry whitespace around the attribute value; trim it so
    // `start=" 4"` is honoured instead of silently falling back to 1.
    let start = parent
        .attr("start")
        .and_then(|value| value.trim().parse::<usize>().ok())
        .unwrap_or(1);

    // Only element siblings named `li` count; text and comment nodes are skipped by `wrap`.
    let previous_items = element_ref
        .prev_siblings()
        .filter_map(scraper::ElementRef::wrap)
        .filter(|sibling| sibling.value().name() == "li")
        .count();

    // `start` comes from untrusted HTML and may be as large as `usize::MAX`;
    // saturate rather than overflow (which panics in debug builds).
    format!("{}. ", start.saturating_add(previous_items))
}
473+
451474
/// Convert HTML to plain text.
452475
pub fn html_to_text(html: &str) -> String {
453476
let cleaned = remove_unwanted_elements(html);

src/cortex-cli/src/scrape_cmd/tests.rs

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,28 @@ mod tests {
4141
assert!(!md_no_images.contains("!["));
4242
}
4343

44+
// Verifies that `html_to_markdown` emits numeric markers for `<ol>` items,
// honours the `start` attribute, and still renders a nested `<ul>` item as a bullet.
#[test]
45+
fn test_html_to_markdown_ordered_lists() {
46+
// Fixture: a three-item ordered list (the third item nests an unordered list),
// followed by a second ordered list that starts at 4.
let html = r#"
47+
<ol>
48+
<li>Install Rust</li>
49+
<li>Run cortex</li>
50+
<li>Check output<ul><li>Keep nested unordered item</li></ul></li>
51+
</ol>
52+
<ol start="4">
53+
<li>Continue numbering</li>
54+
</ol>
55+
"#;
56+
let md = html_to_markdown(html, false, false);
57+
58+
// Items of the first list are numbered sequentially from 1.
assert!(md.contains("1. Install Rust"), "got: {md}");
59+
assert!(md.contains("2. Run cortex"), "got: {md}");
60+
assert!(md.contains("3. Check output"), "got: {md}");
61+
// The nested unordered item keeps its indented bullet marker.
assert!(md.contains(" - Keep nested unordered item"), "got: {md}");
62+
// The second list picks up its numbering from `start="4"`.
assert!(md.contains("4. Continue numbering"), "got: {md}");
63+
// Ordered items must no longer be rendered with the bullet marker.
assert!(!md.contains("- Install Rust"), "got: {md}");
64+
}
65+
4466
#[test]
4567
fn test_html_to_text() {
4668
let html = "<h1>Title</h1><p>Hello <strong>world</strong>!</p>";

0 commit comments

Comments
 (0)