Skip to content

Commit d42c3df

Browse files
committed
fix(scrape): emit markdown table separators
1 parent 7954d02 commit d42c3df

2 files changed

Lines changed: 104 additions & 6 deletions

File tree

src/cortex-cli/src/scrape_cmd/html.rs

Lines changed: 85 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -380,15 +380,11 @@ fn process_node_to_markdown(
380380
// Tables
381381
"table" => {
382382
output.push_str("\n\n");
383-
process_node_to_markdown(
383+
output.push_str(&render_table_markdown(
384384
element_ref,
385-
output,
386-
list_depth,
387-
in_pre,
388-
in_code,
389385
no_images,
390386
no_links,
391-
);
387+
));
392388
output.push_str("\n\n");
393389
}
394390
"thead" | "tbody" | "tfoot" => {
@@ -448,6 +444,89 @@ fn process_node_to_markdown(
448444
}
449445
}
450446

447+
/// Renders an HTML `<table>` element as a pipe-delimited Markdown table.
///
/// Rows are gathered via `collect_table_rows`. The first non-empty row is
/// treated as the header and is followed by a `| --- | ... |` separator row.
///
/// Every emitted row (header, separator, and body) is padded with empty cells
/// to the width of the widest row. Markdown renderers take the column count
/// from the header row and discard surplus cells in later rows, so a body row
/// wider than the header would otherwise lose content.
///
/// Rows that contain no `<th>`/`<td>` cells are dropped so that a stray empty
/// `<tr>` cannot become a blank header. Returns an empty string when no row
/// with cells is found.
fn render_table_markdown(table: scraper::ElementRef, no_images: bool, no_links: bool) -> String {
    let mut rows: Vec<Vec<String>> = Vec::new();
    collect_table_rows(table, &mut rows, no_images, no_links);

    // A row without cells carries no content; keeping it could turn it into
    // the header row and push the real header down into the body.
    rows.retain(|row| !row.is_empty());

    // `max()` is `None` only when there are no rows left to render.
    let Some(column_count) = rows.iter().map(Vec::len).max() else {
        return String::new();
    };

    let mut output = String::new();
    for (row_index, row) in rows.iter().enumerate() {
        push_table_row(&mut output, row.iter().map(String::as_str), column_count);

        // The separator must directly follow the header row.
        if row_index == 0 {
            push_table_row(&mut output, std::iter::repeat("---"), column_count);
        }
    }

    output
}

/// Appends one `| a | b |` line to `output`, emitting exactly `column_count`
/// cells: missing cells are filled with empty strings.
fn push_table_row<'a>(
    output: &mut String,
    cells: impl Iterator<Item = &'a str>,
    column_count: usize,
) {
    output.push('|');
    for cell in cells.chain(std::iter::repeat("")).take(column_count) {
        output.push(' ');
        output.push_str(cell);
        output.push_str(" |");
    }
    output.push('\n');
}
470+
471+
fn collect_table_rows(
472+
node: scraper::ElementRef,
473+
rows: &mut Vec<Vec<String>>,
474+
no_images: bool,
475+
no_links: bool,
476+
) {
477+
for child in node.children() {
478+
if let Some(element_ref) = scraper::ElementRef::wrap(child) {
479+
match element_ref.value().name.local.as_ref() {
480+
"tr" => rows.push(collect_table_cells(element_ref, no_images, no_links)),
481+
"thead" | "tbody" | "tfoot" => {
482+
collect_table_rows(element_ref, rows, no_images, no_links);
483+
}
484+
_ => {}
485+
}
486+
}
487+
}
488+
}
489+
490+
fn collect_table_cells(row: scraper::ElementRef, no_images: bool, no_links: bool) -> Vec<String> {
491+
let mut cells = Vec::new();
492+
493+
for child in row.children() {
494+
if let Some(element_ref) = scraper::ElementRef::wrap(child)
495+
&& matches!(element_ref.value().name.local.as_ref(), "th" | "td")
496+
{
497+
cells.push(render_table_cell(element_ref, no_images, no_links));
498+
}
499+
}
500+
501+
cells
502+
}
503+
504+
fn render_table_cell(cell: scraper::ElementRef, no_images: bool, no_links: bool) -> String {
505+
let mut output = String::new();
506+
let mut list_depth = 0;
507+
let mut in_pre = false;
508+
let mut in_code = false;
509+
510+
process_node_to_markdown(
511+
cell,
512+
&mut output,
513+
&mut list_depth,
514+
&mut in_pre,
515+
&mut in_code,
516+
no_images,
517+
no_links,
518+
);
519+
520+
clean_table_cell(&output)
521+
}
522+
523+
fn clean_table_cell(cell: &str) -> String {
524+
normalize_whitespace(&cell.replace('\n', " "))
525+
.replace('|', "\\|")
526+
.trim()
527+
.to_string()
528+
}
529+
451530
/// Convert HTML to plain text.
452531
pub fn html_to_text(html: &str) -> String {
453532
let cleaned = remove_unwanted_elements(html);

src/cortex-cli/src/scrape_cmd/tests.rs

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,25 @@ mod tests {
4141
assert!(!md_no_images.contains("!["));
4242
}
4343

44+
/// A table with a header and one body row must be converted to Markdown
/// containing the header row, a `---` separator row, and the body row.
#[test]
fn test_html_to_markdown_table_includes_separator() {
    let html = r#"
        <table>
            <thead>
                <tr><th>Name</th><th>Age</th></tr>
            </thead>
            <tbody>
                <tr><td>Alice</td><td>30</td></tr>
            </tbody>
        </table>
    "#;
    let md = html_to_markdown(html, false, false);

    let expected_rows = ["| Name | Age |", "| --- | --- |", "| Alice | 30 |"];
    for expected in expected_rows {
        assert!(md.contains(expected), "missing table row {expected:?}");
    }
}
62+
4463
#[test]
4564
fn test_html_to_text() {
4665
let html = "<h1>Title</h1><p>Hello <strong>world</strong>!</p>";

0 commit comments

Comments
 (0)