Skip to content

Commit dbf8e44

Browse files
committed
Improvements
Signed-off-by: Juan Cruz Viotti <jv@jviotti.com>
1 parent abd1310 commit dbf8e44

8 files changed

Lines changed: 3799 additions & 48 deletions

File tree

src/core/yaml/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,6 @@ if(SOURCEMETA_CORE_INSTALL)
77
endif()
88

99
target_link_libraries(sourcemeta_core_yaml PUBLIC sourcemeta::core::json)
10+
target_link_libraries(sourcemeta_core_yaml PUBLIC sourcemeta::core::jsonpointer)
1011
target_link_libraries(sourcemeta_core_yaml PRIVATE sourcemeta::core::io)
1112
target_link_libraries(sourcemeta_core_yaml PRIVATE sourcemeta::core::unicode)

src/core/yaml/include/sourcemeta/core/yaml_roundtrip.h

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,62 @@
55
#include <sourcemeta/core/yaml_export.h>
66
#endif
77

8+
#include <sourcemeta/core/json.h>
9+
#include <sourcemeta/core/jsonpointer.h>
10+
11+
#include <cstdint> // std::uint8_t, std::size_t
12+
#include <map> // std::map
13+
#include <optional> // std::optional
14+
#include <string> // std::string
15+
#include <vector> // std::vector
16+
817
namespace sourcemeta::core {
918

19+
/// @ingroup yaml
/// Presentation style recorded for a YAML scalar. The enumerators are named
/// after the YAML scalar styles: plain, single-quoted, double-quoted, and the
/// two block styles (literal and folded). Stored as a single byte.
20+
enum class YAMLScalarStyle : std::uint8_t {
21+
Plain,
22+
SingleQuoted,
23+
DoubleQuoted,
24+
Literal,
25+
Folded
26+
};
27+
28+
/// @ingroup yaml
/// Presentation style recorded for a YAML collection: block or flow. Stored
/// as a single byte.
29+
enum class YAMLCollectionStyle : std::uint8_t { Block, Flow };
30+
31+
/// @ingroup yaml
/// Chomping mode recorded for a block scalar: clip, strip, or keep.
/// NOTE(review): has the same enumerators as the lexer-level
/// yaml::BlockChomping; presumably the parser converts one into the other,
/// confirm against the parser.
32+
enum class YAMLChomping : std::uint8_t { Clip, Strip, Keep };
33+
34+
/// @ingroup yaml
/// Formatting attributes recorded for a single YAML node. Every member is
/// either a std::optional or a std::vector, so a default-constructed value
/// records nothing.
/// NOTE(review): the per-member notes below are inferred from the member
/// names and types; the code that populates them is not part of this header,
/// so confirm against the parser.
35+
struct SOURCEMETA_CORE_YAML_EXPORT YAMLNodeStyle {
36+
// Presumably set when the node is a scalar
std::optional<YAMLScalarStyle> scalar;
37+
// Presumably set when the node is a sequence or mapping
std::optional<YAMLCollectionStyle> collection;
38+
// Presumably only meaningful for block (literal or folded) scalars
std::optional<YAMLChomping> chomping;
39+
// Presumably the original text of a block scalar, kept so that its line
// breaks can be written back unchanged
std::optional<std::string> block_content;
40+
// Presumably the anchor name attached to the node, if any
std::optional<std::string> anchor;
41+
// Presumably the comment lines that appeared before the node
std::vector<std::string> comments_before;
42+
// Presumably the comment that followed the node on the same line
std::optional<std::string> comment_inline;
43+
// Presumably a comment that followed an indicator rather than a value
// (TODO confirm which indicators this covers)
std::optional<std::string> comment_on_indicator;
44+
};
45+
1046
/// @ingroup yaml
1147
/// Holds per-node metadata collected during YAML parsing to reproduce the
1248
/// original formatting
///
/// All data members are public. The per-node maps are keyed by Pointer
/// (presumably the JSON Pointer of the node within the parsed document,
/// confirm against the parser). A default-constructed instance has empty
/// containers, no explicit document markers, and an indent width of 2.
1349
class SOURCEMETA_CORE_YAML_EXPORT YAMLRoundTrip {
1450
public:
1551
// Defined out of line; presumably resets every member to its default state
auto clear() -> void;
52+
// Formatting attributes for each node that has any recorded
std::map<Pointer, YAMLNodeStyle> styles;
53+
// Presumably the anchor name referenced by each alias node
std::map<Pointer, std::string> aliases;
54+
// Presumably the scalar style used by the mapping key at each location
std::map<Pointer, YAMLScalarStyle> key_styles;
55+
// Presumably whether the document had an explicit start marker
bool explicit_document_start{false};
56+
// Presumably whether the document had an explicit end marker
bool explicit_document_end{false};
57+
// Presumably the comment on the same line as the document start marker
std::optional<std::string> document_start_comment;
58+
// Presumably the comment on the same line as the document end marker
std::optional<std::string> document_end_comment;
59+
// NOTE(review): the four comment lists below are presumably split by their
// position relative to the document start and end markers; confirm the
// exact boundaries against the parser
std::vector<std::string> leading_comments;
60+
std::vector<std::string> post_start_comments;
61+
std::vector<std::string> pre_end_comments;
62+
std::vector<std::string> trailing_comments;
63+
// Defaults to 2; presumably the indentation width detected in the input
std::size_t indent_width{2};
1664
};
1765

1866
} // namespace sourcemeta::core

src/core/yaml/lexer.h

Lines changed: 130 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <optional> // std::optional
1010
#include <string> // std::string
1111
#include <string_view> // std::string_view
12+
#include <vector> // std::vector
1213

1314
namespace sourcemeta::core::yaml {
1415

@@ -42,22 +43,33 @@ enum class ScalarStyle : std::uint8_t {
4243
Folded
4344
};
4445

46+
// Chomping mode of a block scalar as reported on its token. The block scalar
// scanner maps the '-' indicator to Strip, the '+' indicator to Keep, and
// everything else to Clip.
enum class BlockChomping : std::uint8_t { Clip, Strip, Keep };
47+
4548
// A single lexical token produced by the lexer. The string_view members do
// not own their text, so a token is only usable while the storage it views
// (for block scalars, a lexer-owned buffer) is still alive.
struct Token {
4649
TokenType type;
4750
// Text of the token
std::string_view value;
4851
// Line and column reported for the token (for block scalars, the position
// where the scalar starts)
std::uint64_t line;
4952
std::uint64_t column;
5053
std::size_t position{0};
5154
ScalarStyle scalar_style{ScalarStyle::Plain};
55+
// Set by the block scalar scanner; stays Clip for every other token
BlockChomping chomping{BlockChomping::Clip};
5256
bool multiline{false};
57+
// For folded block scalars scanned in roundtrip mode, the scalar text with
// its original line breaks preserved; empty otherwise
std::string_view block_original{};
5358
};
5459

5560
class Lexer {
5661
public:
57-
Lexer(const std::string_view input) : input_{input} {}
62+
// Creates a lexer over `input`. When `roundtrip_mode` is true the lexer also
// records the comments it skips and, for folded block scalars, the text with
// its original line breaks, so that formatting can be reproduced later.
// The default of false leaves that extra bookkeeping off.
// NOTE(review): `input` is taken as a std::string_view, so presumably the
// caller must keep the underlying text alive while the lexer is in use.
Lexer(const std::string_view input, const bool roundtrip_mode = false)
63+
: input_{input}, roundtrip_{roundtrip_mode} {}
5864

5965
auto next() -> std::optional<Token> {
66+
if (this->roundtrip_) {
67+
this->inline_comment_buffer_.reset();
68+
}
6069
this->skip_whitespace_and_comments();
70+
if (this->roundtrip_) {
71+
this->comment_reference_line_ = this->line_;
72+
}
6173

6274
if (this->position_ >= this->input_.size()) {
6375
if (!this->stream_started_) {
@@ -270,6 +282,24 @@ class Lexer {
270282
return this->position_;
271283
}
272284

285+
// Hands the buffered inline comment (if any) to the caller and leaves the
// buffer empty. In roundtrip mode the buffer is filled with the first
// comment found on the reference line recorded by next(), and it is reset
// at the start of every next() call.
auto take_inline_comment() -> std::optional<std::string> {
286+
auto result{std::move(this->inline_comment_buffer_)};
287+
// A moved-from std::optional still holds a (moved-from) value, so it has to
// be emptied explicitly
this->inline_comment_buffer_.reset();
288+
return result;
289+
}
290+
291+
// Hands the accumulated non-inline comments to the caller and leaves the
// buffer empty. In roundtrip mode each entry is the text of one comment,
// starting at its '#'; an empty string is appended for each blank line
// encountered while skipping whitespace.
auto take_preceding_comments() -> std::vector<std::string> {
292+
auto result{std::move(this->preceding_comments_buffer_)};
293+
// Make sure the buffer is empty regardless of its moved-from state
this->preceding_comments_buffer_.clear();
294+
return result;
295+
}
296+
297+
// Hands the comment captured from a block scalar header (if any) to the
// caller and leaves the buffer empty. In roundtrip mode the block scalar
// scanner stores here a comment that follows whitespace on the header line,
// starting at its '#'.
auto take_block_scalar_comment() -> std::optional<std::string> {
298+
auto result{std::move(this->block_scalar_comment_)};
299+
// A moved-from std::optional still holds a (moved-from) value, so it has to
// be emptied explicitly
this->block_scalar_comment_.reset();
300+
return result;
301+
}
302+
273303
private:
274304
[[nodiscard]] static auto is_whitespace(const char character) noexcept
275305
-> bool {
@@ -312,6 +342,7 @@ class Lexer {
312342
(this->position_ > 0 &&
313343
is_whitespace(this->input_[this->position_ - 1]))};
314344
bool at_line_start{this->column_ == 1};
345+
bool blank_line{at_line_start};
315346
this->tab_at_line_start_ = false;
316347
while (this->position_ < this->input_.size()) {
317348
const char current{this->peek()};
@@ -332,19 +363,43 @@ class Lexer {
332363
}
333364

334365
if (current == '\n' || current == '\r') {
366+
if (this->roundtrip_ && blank_line) {
367+
this->preceding_comments_buffer_.emplace_back();
368+
}
335369
this->advance(1);
336370
if (current == '\r' && this->peek() == '\n') {
337371
this->advance(1);
338372
}
339373
preceded_by_whitespace = true;
340374
at_line_start = true;
375+
blank_line = true;
341376
this->tab_at_line_start_ = false;
342377
continue;
343378
}
344379

345380
if (current == '#' && preceded_by_whitespace) {
346-
while (this->position_ < this->input_.size() && this->peek() != '\n') {
347-
this->advance(1);
381+
blank_line = false;
382+
if (this->roundtrip_) {
383+
const auto comment_line{this->line_};
384+
const auto comment_start{this->position_};
385+
while (this->position_ < this->input_.size() &&
386+
this->peek() != '\n') {
387+
this->advance(1);
388+
}
389+
std::string text{this->input_.substr(
390+
comment_start, this->position_ - comment_start)};
391+
if (comment_line == this->comment_reference_line_ &&
392+
this->comment_reference_line_ > 0 &&
393+
!this->inline_comment_buffer_.has_value()) {
394+
this->inline_comment_buffer_ = std::move(text);
395+
} else {
396+
this->preceding_comments_buffer_.push_back(std::move(text));
397+
}
398+
} else {
399+
while (this->position_ < this->input_.size() &&
400+
this->peek() != '\n') {
401+
this->advance(1);
402+
}
348403
}
349404
continue;
350405
}
@@ -942,8 +997,19 @@ class Lexer {
942997
seen_header_whitespace = true;
943998
this->advance(1);
944999
} else if (current == '#' && seen_header_whitespace) {
945-
while (this->position_ < this->input_.size() && this->peek() != '\n') {
946-
this->advance(1);
1000+
if (this->roundtrip_) {
1001+
const auto comment_start{this->position_};
1002+
while (this->position_ < this->input_.size() &&
1003+
this->peek() != '\n') {
1004+
this->advance(1);
1005+
}
1006+
this->block_scalar_comment_ = std::string{this->input_.substr(
1007+
comment_start, this->position_ - comment_start)};
1008+
} else {
1009+
while (this->position_ < this->input_.size() &&
1010+
this->peek() != '\n') {
1011+
this->advance(1);
1012+
}
9471013
}
9481014
} else if (current == '\n' || current == '\r') {
9491015
break;
@@ -961,6 +1027,16 @@ class Lexer {
9611027
}
9621028

9631029
auto &buffer{this->get_buffer()};
1030+
1031+
// For folded scalars in roundtrip mode, build a parallel buffer that
1032+
// preserves original line breaks (literal-style) for round-trip output
1033+
const bool build_original{style == ScalarStyle::Folded && this->roundtrip_};
1034+
std::string *original{nullptr};
1035+
std::string original_trailing;
1036+
if (build_original) {
1037+
original = &this->get_buffer();
1038+
}
1039+
9641040
const auto content_indent{this->detect_block_scalar_indent(
9651041
explicit_indent, indicator_position, start_line, start_column)};
9661042

@@ -990,6 +1066,17 @@ class Lexer {
9901066
trailing_newlines += '\n';
9911067
} else {
9921068
blank_line_count++;
1069+
if (original) {
1070+
if (line_indent > content_indent) {
1071+
*original += original_trailing;
1072+
original_trailing.clear();
1073+
for (std::size_t index = content_indent; index < line_indent;
1074+
++index) {
1075+
*original += ' ';
1076+
}
1077+
}
1078+
original_trailing += '\n';
1079+
}
9931080
}
9941081
this->advance(1);
9951082
if (this->input_[this->position_ - 1] == '\r' && this->peek() == '\n') {
@@ -1042,15 +1129,27 @@ class Lexer {
10421129
blank_line_count = 0;
10431130
had_line_break = false;
10441131
previous_started_with_whitespace = starts_with_whitespace;
1132+
1133+
if (original) {
1134+
*original += original_trailing;
1135+
original_trailing.clear();
1136+
}
10451137
}
10461138

10471139
for (std::size_t index = content_indent; index < line_indent; ++index) {
10481140
buffer += ' ';
1141+
if (original) {
1142+
*original += ' ';
1143+
}
10491144
}
10501145

10511146
while (this->position_ < this->input_.size() && this->peek() != '\n' &&
10521147
this->peek() != '\r') {
1053-
buffer += this->peek();
1148+
const auto character{this->peek()};
1149+
buffer += character;
1150+
if (original) {
1151+
*original += character;
1152+
}
10541153
this->advance(1);
10551154
}
10561155

@@ -1063,6 +1162,9 @@ class Lexer {
10631162
trailing_newlines += '\n';
10641163
} else {
10651164
had_line_break = true;
1165+
if (original) {
1166+
original_trailing += '\n';
1167+
}
10661168
}
10671169
this->advance(1);
10681170
if (this->input_[this->position_ - 1] == '\r' && this->peek() == '\n') {
@@ -1081,6 +1183,9 @@ class Lexer {
10811183
for (std::size_t count = 0; count < blank_line_count; ++count) {
10821184
buffer += '\n';
10831185
}
1186+
if (original) {
1187+
*original += original_trailing;
1188+
}
10841189
}
10851190
} else if (chomping == 'c' && !buffer.empty()) {
10861191
if (style == ScalarStyle::Literal) {
@@ -1089,14 +1194,27 @@ class Lexer {
10891194
}
10901195
} else if (had_line_break || blank_line_count > 0) {
10911196
buffer += '\n';
1197+
if (original && !original_trailing.empty()) {
1198+
*original += '\n';
1199+
}
10921200
}
10931201
}
10941202

1203+
BlockChomping block_chomping{BlockChomping::Clip};
1204+
if (chomping == '-') {
1205+
block_chomping = BlockChomping::Strip;
1206+
} else if (chomping == '+') {
1207+
block_chomping = BlockChomping::Keep;
1208+
}
1209+
10951210
return Token{.type = TokenType::Scalar,
10961211
.value = buffer,
10971212
.line = start_line,
10981213
.column = start_column,
1099-
.scalar_style = style};
1214+
.scalar_style = style,
1215+
.chomping = block_chomping,
1216+
.block_original = original ? std::string_view{*original}
1217+
: std::string_view{}};
11001218
}
11011219

11021220
auto scan_plain_scalar() -> Token {
@@ -1336,6 +1454,11 @@ class Lexer {
13361454
bool stream_ended_{false};
13371455
bool last_was_quoted_scalar_{false};
13381456
bool tab_at_line_start_{false};
1457+
bool roundtrip_{false};
1458+
std::uint64_t comment_reference_line_{0};
1459+
std::optional<std::string> inline_comment_buffer_;
1460+
std::optional<std::string> block_scalar_comment_;
1461+
std::vector<std::string> preceding_comments_buffer_;
13391462
// SIZE_MAX means "not set" (top-level), 0 means parent at indent 0
13401463
std::size_t block_indent_{SIZE_MAX};
13411464
std::deque<std::string> scalar_buffers_;

0 commit comments

Comments
 (0)