-
Notifications
You must be signed in to change notification settings - Fork 0
Sql tokenizer #36
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Sql tokenizer #36
Changes from all commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
# Build the `sql` library from the tokenizer implementation.
add_library(sql tokenizer.cpp)
# Targets linking against `sql` pick up this directory on their include path,
# so they can `#include "tokenizer.h"` / `#include "token.h"` directly.
target_include_directories(sql PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,69 @@ | ||
#pragma once

#include <algorithm>
#include <cctype>
#include <cstdint>
#include <string>
#include <unordered_map>

// Categories of token produced by the SQL tokenizer.
enum class TokenType : std::uint8_t {
    // keywords
    SELECT,
    FROM,

    // literals
    LONG,
    DOUBLE,
    STRING,
    IDENTIFIER,

    // symbols
    STAR,
    COMMA,
};

namespace Type
{

/// Map a token's text to its TokenType.
/// Keyword lookup is case-insensitive; any text that is not a known
/// keyword or symbol is classified as IDENTIFIER.
inline TokenType from_string(std::string token)
{
    // Uppercase a copy so "select" and "SELECT" hit the same map entry.
    // The parameter is unsigned char: passing a negative char to
    // std::toupper is undefined behaviour.
    std::transform(token.begin(), token.end(), token.begin(),
                   [](unsigned char letter) { return std::toupper(letter); });

    static const std::unordered_map<std::string, TokenType> keywords = {
        {"SELECT", TokenType::SELECT}, {"FROM", TokenType::FROM}, {"*", TokenType::STAR}, {",", TokenType::COMMA}};

    auto iter = keywords.find(token);
    return iter != keywords.end() ? iter->second : TokenType::IDENTIFIER;
}

} // namespace Type

// Character-classification helpers used by the scanner to decide which
// kind of token starts at (or continues from) a given character.
struct Literal
{
    // Leading character of a numeric literal: a digit or a decimal point.
    static bool is_number_start(unsigned char letter) { return std::isdigit(letter) != 0 || letter == '.'; }

    // Leading character of an identifier: a letter, or a backtick for a
    // delimited identifier. Cast before the <cctype> call — passing a
    // plain (possibly negative) char is undefined behaviour.
    static bool is_identifier_start(char letter)
    {
        return std::isalpha(static_cast<unsigned char>(letter)) != 0 || letter == '`';
    }

    // Leading character of a quoted string literal.
    static bool is_char_start(char letter) { return letter == '\'' || '"' == letter; }

    static bool is_symbol_start(char letter) { return is_symbol(letter); }

    // Symbol characters the tokenizer recognises. Direct comparison
    // instead of building a std::set on every call.
    static bool is_symbol(char letter) { return letter == '*' || letter == ','; }

    // Character that may continue an (unquoted) identifier.
    static bool is_identifier_part(char letter)
    {
        const auto ch = static_cast<unsigned char>(letter);
        return std::isdigit(ch) != 0 || std::isalpha(ch) != 0 || letter == '_';
    }
};

// A scanned token: its raw text, its category, and the offset one past its
// last character in the source string (where scanning should resume).
struct Token
{
    std::string text_;
    TokenType type_;
    int end_offset_;

    // const& (rather than the former non-const reference) so temporaries
    // and const strings can be passed too; copies into text_ either way.
    Token(const std::string &text, TokenType type, int end_offset)
        : text_(text), type_(type), end_offset_(end_offset)
    {
    }
};
|
Comment on lines
+62
to
+69
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,119 @@ | ||
| #include <iostream> | ||
| #include <optional> | ||
| #include <stdexcept> | ||
| #include <string> | ||
| #include <vector> | ||
|
|
||
| #include "token.h" | ||
| #include "tokenizer.h" | ||
|
|
||
| SqlTokenizer::SqlTokenizer(const std::string &sql) : sql_(sql) { offset_ = 0; } | ||
|
|
||
| auto SqlTokenizer::tokenize() -> std::vector<Token> | ||
| { | ||
| std::vector<Token> res; | ||
|
|
||
| auto token = next_token(); | ||
| while (token.has_value()) | ||
| { | ||
| res.push_back(token.value()); | ||
| token = next_token(); | ||
| } | ||
|
|
||
| return res; | ||
| } | ||
|
|
||
| auto SqlTokenizer::next_token() -> std::optional<Token> | ||
| { | ||
| auto offset = skip_whitespace(offset_); | ||
| if (offset > (int)sql_.length() - 1) | ||
| { | ||
| return std::nullopt; | ||
| } | ||
|
|
||
| if (Literal::is_identifier_start(sql_[offset])) | ||
| { | ||
|
jobala marked this conversation as resolved.
|
||
| auto token = scan_identifier(offset); | ||
| offset_ = token.end_offset_; | ||
| return token; | ||
| } | ||
|
|
||
| if (Literal::is_symbol_start(sql_[offset])) | ||
| { | ||
| auto token = scan_symbol(offset); | ||
| offset_ = token.end_offset_; | ||
| return token; | ||
| } | ||
|
|
||
| if (Literal::is_number_start(sql_[offset])) | ||
| { | ||
| throw std::runtime_error("Not Implemented"); | ||
| } | ||
|
|
||
| if (Literal::is_char_start(sql_[offset])) | ||
| { | ||
| throw std::runtime_error("Not Implemented"); | ||
|
Comment on lines
+50
to
+55
|
||
| } | ||
|
|
||
| return std::nullopt; | ||
| } | ||
|
Comment on lines
+53
to
+59
|
||
|
|
||
| auto SqlTokenizer::skip_whitespace(int start_offset) -> int | ||
| { | ||
| auto end_offset = start_offset; | ||
|
|
||
|
jobala marked this conversation as resolved.
|
||
| while (end_offset < (int)sql_.size() && sql_[end_offset] == ' ') | ||
| { | ||
| end_offset += 1; | ||
| } | ||
| return end_offset; | ||
|
Comment on lines
+65
to
+69
|
||
| } | ||
|
|
||
| auto SqlTokenizer::scan_identifier(int start_offset) -> Token | ||
| { | ||
| if (offset_ < (int)sql_.size() && '`' == sql_[offset_]) | ||
| { | ||
| auto end_offset = get_offset_until_terminated_char('`', start_offset); | ||
| auto text = sql_.substr(start_offset, end_offset - start_offset); | ||
| return {text, TokenType::IDENTIFIER, end_offset + 1}; | ||
|
Comment on lines
+74
to
+78
|
||
| } | ||
|
Comment on lines
+72
to
+79
|
||
|
|
||
| auto end_offset = start_offset; | ||
| while (end_offset < (int)sql_.size() && Literal::is_identifier_part(sql_[end_offset])) | ||
| { | ||
| end_offset += 1; | ||
| } | ||
|
|
||
| auto text = sql_.substr(start_offset, end_offset - start_offset); | ||
| auto token_type = Type::from_string(text); | ||
| return {text, token_type, end_offset}; | ||
| } | ||
|
|
||
| auto SqlTokenizer::scan_symbol(int start_offset) -> Token | ||
| { | ||
| auto end_offset = start_offset; | ||
| while (end_offset < (int)sql_.size() && Literal::is_symbol(sql_[end_offset])) | ||
| { | ||
| end_offset += 1; | ||
| } | ||
|
|
||
| auto text = sql_.substr(start_offset, end_offset - start_offset); | ||
| auto token_type = Type::from_string(text); | ||
| return {text, token_type, end_offset}; | ||
| } | ||
|
|
||
| auto SqlTokenizer::get_offset_until_terminated_char(unsigned char terminated, int start_offset) -> int | ||
| { | ||
| auto end_offset = start_offset; | ||
| while (end_offset < (int)sql_.size() && static_cast<unsigned char>(sql_[end_offset]) != terminated) | ||
| { | ||
| end_offset += 1; | ||
| } | ||
|
|
||
|
|
||
| if (end_offset >= (int)sql_.size()) | ||
| { | ||
| throw std::runtime_error("Unterminated delimited identifier"); | ||
| } | ||
|
|
||
| return end_offset; | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| #pragma once | ||
|
|
||
| #include <optional> | ||
| #include <string> | ||
| #include <vector> | ||
|
|
||
| #include "token.h" | ||
|
|
||
| class SqlTokenizer | ||
| { | ||
| std::string sql_; | ||
| int offset_; | ||
|
|
||
| int skip_whitespace(int start_offset); | ||
| int get_offset_until_terminated_char(unsigned char terminated, int start_offset); | ||
| Token scan_identifier(int start_offset); | ||
| Token scan_symbol(int start_offset); | ||
|
|
||
| public: | ||
| SqlTokenizer(const std::string &sql); | ||
| std::vector<Token> tokenize(); | ||
| std::optional<Token> next_token(); | ||
| }; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,43 @@ | ||
| #include <gtest/gtest.h> | ||
| #include <string> | ||
| #include <vector> | ||
|
|
||
| #include "token.h" | ||
| #include "tokenizer.h" | ||
|
|
||
| TEST(Tokenizer, tokenize_sql_string) | ||
| { | ||
| std::string query = "select * from users"; | ||
|
|
||
| SqlTokenizer tokenizer(query); | ||
| auto tokens = tokenizer.tokenize(); | ||
|
|
||
| std::vector<std::string> res{"select", "*", "from", "users"}; | ||
| std::vector<TokenType> token_types{TokenType::SELECT, TokenType::STAR, TokenType::FROM, TokenType::IDENTIFIER}; | ||
|
|
||
| ASSERT_EQ(4, tokens.size()); | ||
| for (int i = 0; i < (int)tokens.size(); i++) | ||
| { | ||
|
Comment on lines
+19
to
+20
|
||
| ASSERT_EQ(res[i], tokens[i].text_); | ||
| ASSERT_EQ(token_types[i], tokens[i].type_); | ||
| } | ||
| } | ||
|
|
||
| TEST(Tokenizer, tokenize_projected_sql_string) | ||
| { | ||
| std::string query = "select name, age from users"; | ||
|
|
||
| SqlTokenizer tokenizer(query); | ||
| auto tokens = tokenizer.tokenize(); | ||
|
|
||
| std::vector<std::string> res{"select", "name", ",", "age", "from", "users"}; | ||
| std::vector<TokenType> token_types{TokenType::SELECT, TokenType::IDENTIFIER, TokenType::COMMA, | ||
| TokenType::IDENTIFIER, TokenType::FROM, TokenType::IDENTIFIER}; | ||
|
|
||
| ASSERT_EQ(6, tokens.size()); | ||
| for (int i = 0; i < (int)tokens.size(); i++) | ||
| { | ||
| ASSERT_EQ(res[i], tokens[i].text_); | ||
| ASSERT_EQ(token_types[i], tokens[i].type_); | ||
| } | ||
| } | ||
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.