99#include < optional> // std::optional
1010#include < string> // std::string
1111#include < string_view> // std::string_view
12+ #include < vector> // std::vector
1213
1314namespace sourcemeta ::core::yaml {
1415
@@ -42,22 +43,33 @@ enum class ScalarStyle : std::uint8_t {
4243 Folded
4344};
4445
// Chomping behaviour attached to a block scalar token. When a block scalar
// token is built, the `-` chomping indicator is translated to `Strip`, the
// `+` indicator to `Keep`, and every other case to `Clip` (which is also the
// default carried by `Token`).
enum class BlockChomping : std::uint8_t {
  Clip,
  Strip,
  Keep
};
47+
4548struct Token {
4649 TokenType type;
4750 std::string_view value;
4851 std::uint64_t line;
4952 std::uint64_t column;
5053 std::size_t position{0 };
5154 ScalarStyle scalar_style{ScalarStyle::Plain};
55+ BlockChomping chomping{BlockChomping::Clip};
5256 bool multiline{false };
57+ std::string_view block_original{};
5358};
5459
5560class Lexer {
5661public:
57- Lexer (const std::string_view input) : input_{input} {}
62+ Lexer (const std::string_view input, const bool roundtrip_mode = false )
63+ : input_{input}, roundtrip_{roundtrip_mode} {}
5864
5965 auto next () -> std::optional<Token> {
66+ if (this ->roundtrip_ ) {
67+ this ->inline_comment_buffer_ .reset ();
68+ }
6069 this ->skip_whitespace_and_comments ();
70+ if (this ->roundtrip_ ) {
71+ this ->comment_reference_line_ = this ->line_ ;
72+ }
6173
6274 if (this ->position_ >= this ->input_ .size ()) {
6375 if (!this ->stream_started_ ) {
@@ -270,6 +282,24 @@ class Lexer {
270282 return this ->position_ ;
271283 }
272284
285+ auto take_inline_comment () -> std::optional<std::string> {
286+ auto result{std::move (this ->inline_comment_buffer_ )};
287+ this ->inline_comment_buffer_ .reset ();
288+ return result;
289+ }
290+
291+ auto take_preceding_comments () -> std::vector<std::string> {
292+ auto result{std::move (this ->preceding_comments_buffer_ )};
293+ this ->preceding_comments_buffer_ .clear ();
294+ return result;
295+ }
296+
297+ auto take_block_scalar_comment () -> std::optional<std::string> {
298+ auto result{std::move (this ->block_scalar_comment_ )};
299+ this ->block_scalar_comment_ .reset ();
300+ return result;
301+ }
302+
273303private:
274304 [[nodiscard]] static auto is_whitespace (const char character) noexcept
275305 -> bool {
@@ -312,6 +342,7 @@ class Lexer {
312342 (this ->position_ > 0 &&
313343 is_whitespace (this ->input_ [this ->position_ - 1 ]))};
314344 bool at_line_start{this ->column_ == 1 };
345+ bool blank_line{at_line_start};
315346 this ->tab_at_line_start_ = false ;
316347 while (this ->position_ < this ->input_ .size ()) {
317348 const char current{this ->peek ()};
@@ -332,19 +363,43 @@ class Lexer {
332363 }
333364
334365 if (current == ' \n ' || current == ' \r ' ) {
366+ if (this ->roundtrip_ && blank_line) {
367+ this ->preceding_comments_buffer_ .emplace_back ();
368+ }
335369 this ->advance (1 );
336370 if (current == ' \r ' && this ->peek () == ' \n ' ) {
337371 this ->advance (1 );
338372 }
339373 preceded_by_whitespace = true ;
340374 at_line_start = true ;
375+ blank_line = true ;
341376 this ->tab_at_line_start_ = false ;
342377 continue ;
343378 }
344379
345380 if (current == ' #' && preceded_by_whitespace) {
346- while (this ->position_ < this ->input_ .size () && this ->peek () != ' \n ' ) {
347- this ->advance (1 );
381+ blank_line = false ;
382+ if (this ->roundtrip_ ) {
383+ const auto comment_line{this ->line_ };
384+ const auto comment_start{this ->position_ };
385+ while (this ->position_ < this ->input_ .size () &&
386+ this ->peek () != ' \n ' ) {
387+ this ->advance (1 );
388+ }
389+ std::string text{this ->input_ .substr (
390+ comment_start, this ->position_ - comment_start)};
391+ if (comment_line == this ->comment_reference_line_ &&
392+ this ->comment_reference_line_ > 0 &&
393+ !this ->inline_comment_buffer_ .has_value ()) {
394+ this ->inline_comment_buffer_ = std::move (text);
395+ } else {
396+ this ->preceding_comments_buffer_ .push_back (std::move (text));
397+ }
398+ } else {
399+ while (this ->position_ < this ->input_ .size () &&
400+ this ->peek () != ' \n ' ) {
401+ this ->advance (1 );
402+ }
348403 }
349404 continue ;
350405 }
@@ -942,8 +997,19 @@ class Lexer {
942997 seen_header_whitespace = true ;
943998 this ->advance (1 );
944999 } else if (current == ' #' && seen_header_whitespace) {
945- while (this ->position_ < this ->input_ .size () && this ->peek () != ' \n ' ) {
946- this ->advance (1 );
1000+ if (this ->roundtrip_ ) {
1001+ const auto comment_start{this ->position_ };
1002+ while (this ->position_ < this ->input_ .size () &&
1003+ this ->peek () != ' \n ' ) {
1004+ this ->advance (1 );
1005+ }
1006+ this ->block_scalar_comment_ = std::string{this ->input_ .substr (
1007+ comment_start, this ->position_ - comment_start)};
1008+ } else {
1009+ while (this ->position_ < this ->input_ .size () &&
1010+ this ->peek () != ' \n ' ) {
1011+ this ->advance (1 );
1012+ }
9471013 }
9481014 } else if (current == ' \n ' || current == ' \r ' ) {
9491015 break ;
@@ -961,6 +1027,16 @@ class Lexer {
9611027 }
9621028
9631029 auto &buffer{this ->get_buffer ()};
1030+
1031+ // For folded scalars in roundtrip mode, build a parallel buffer that
1032+ // preserves original line breaks (literal-style) for round-trip output
1033+ const bool build_original{style == ScalarStyle::Folded && this ->roundtrip_ };
1034+ std::string *original{nullptr };
1035+ std::string original_trailing;
1036+ if (build_original) {
1037+ original = &this ->get_buffer ();
1038+ }
1039+
9641040 const auto content_indent{this ->detect_block_scalar_indent (
9651041 explicit_indent, indicator_position, start_line, start_column)};
9661042
@@ -990,6 +1066,17 @@ class Lexer {
9901066 trailing_newlines += ' \n ' ;
9911067 } else {
9921068 blank_line_count++;
1069+ if (original) {
1070+ if (line_indent > content_indent) {
1071+ *original += original_trailing;
1072+ original_trailing.clear ();
1073+ for (std::size_t index = content_indent; index < line_indent;
1074+ ++index) {
1075+ *original += ' ' ;
1076+ }
1077+ }
1078+ original_trailing += ' \n ' ;
1079+ }
9931080 }
9941081 this ->advance (1 );
9951082 if (this ->input_ [this ->position_ - 1 ] == ' \r ' && this ->peek () == ' \n ' ) {
@@ -1042,15 +1129,27 @@ class Lexer {
10421129 blank_line_count = 0 ;
10431130 had_line_break = false ;
10441131 previous_started_with_whitespace = starts_with_whitespace;
1132+
1133+ if (original) {
1134+ *original += original_trailing;
1135+ original_trailing.clear ();
1136+ }
10451137 }
10461138
10471139 for (std::size_t index = content_indent; index < line_indent; ++index) {
10481140 buffer += ' ' ;
1141+ if (original) {
1142+ *original += ' ' ;
1143+ }
10491144 }
10501145
10511146 while (this ->position_ < this ->input_ .size () && this ->peek () != ' \n ' &&
10521147 this ->peek () != ' \r ' ) {
1053- buffer += this ->peek ();
1148+ const auto character{this ->peek ()};
1149+ buffer += character;
1150+ if (original) {
1151+ *original += character;
1152+ }
10541153 this ->advance (1 );
10551154 }
10561155
@@ -1063,6 +1162,9 @@ class Lexer {
10631162 trailing_newlines += ' \n ' ;
10641163 } else {
10651164 had_line_break = true ;
1165+ if (original) {
1166+ original_trailing += ' \n ' ;
1167+ }
10661168 }
10671169 this ->advance (1 );
10681170 if (this ->input_ [this ->position_ - 1 ] == ' \r ' && this ->peek () == ' \n ' ) {
@@ -1081,6 +1183,9 @@ class Lexer {
10811183 for (std::size_t count = 0 ; count < blank_line_count; ++count) {
10821184 buffer += ' \n ' ;
10831185 }
1186+ if (original) {
1187+ *original += original_trailing;
1188+ }
10841189 }
10851190 } else if (chomping == ' c' && !buffer.empty ()) {
10861191 if (style == ScalarStyle::Literal) {
@@ -1089,14 +1194,27 @@ class Lexer {
10891194 }
10901195 } else if (had_line_break || blank_line_count > 0 ) {
10911196 buffer += ' \n ' ;
1197+ if (original && !original_trailing.empty ()) {
1198+ *original += ' \n ' ;
1199+ }
10921200 }
10931201 }
10941202
1203+ BlockChomping block_chomping{BlockChomping::Clip};
1204+ if (chomping == ' -' ) {
1205+ block_chomping = BlockChomping::Strip;
1206+ } else if (chomping == ' +' ) {
1207+ block_chomping = BlockChomping::Keep;
1208+ }
1209+
10951210 return Token{.type = TokenType::Scalar,
10961211 .value = buffer,
10971212 .line = start_line,
10981213 .column = start_column,
1099- .scalar_style = style};
1214+ .scalar_style = style,
1215+ .chomping = block_chomping,
1216+ .block_original = original ? std::string_view{*original}
1217+ : std::string_view{}};
11001218 }
11011219
11021220 auto scan_plain_scalar () -> Token {
@@ -1336,6 +1454,11 @@ class Lexer {
13361454 bool stream_ended_{false };
13371455 bool last_was_quoted_scalar_{false };
13381456 bool tab_at_line_start_{false };
1457+ bool roundtrip_{false };
1458+ std::uint64_t comment_reference_line_{0 };
1459+ std::optional<std::string> inline_comment_buffer_;
1460+ std::optional<std::string> block_scalar_comment_;
1461+ std::vector<std::string> preceding_comments_buffer_;
13391462 // SIZE_MAX means "not set" (top-level), 0 means parent at indent 0
13401463 std::size_t block_indent_{SIZE_MAX};
13411464 std::deque<std::string> scalar_buffers_;
0 commit comments