Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 28 additions & 15 deletions examples/llama_qnn_aot/aot_run.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
#include <string>
#include "mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp"
#include "configuration_llama3.hpp"
#include "mllm/models/llama/tokenization_tiny_llama.hpp"
#include "mllm/models/qwen3/tokenization_qwen3.hpp"
#include "mllm/models/llama/tokenization_llama.hpp"

using mllm::Argparse;
using namespace mllm::qnn::aot; // NOLINT
Expand All @@ -16,8 +15,8 @@ MLLM_MAIN({
auto& tokenizer_path = Argparse::add<std::string>("-t|--tokenizer").help("Tokenizer path").def("tokenizer.json");
auto& config_path = Argparse::add<std::string>("-c|--config").help("Config path").required(true);
auto& ar_len = Argparse::add<int>("--ar_len").help("Autoregressive length (chunk size)").def(128);
auto& seq_len = Argparse::add<int>("--seq_len").help("Input sequence length").def(800);
auto& gen_len = Argparse::add<int>("--gen_len").help("Generate token length").def(32);
// auto& seq_len = Argparse::add<int>("--seq_len").help("Input sequence length").def(800);
// auto& gen_len = Argparse::add<int>("--gen_len").help("Generate token length").def(32);

Argparse::parse(argc, argv);

Expand All @@ -37,31 +36,45 @@ MLLM_MAIN({
config.vocab_size = llama_cfg.vocab_size;
config.context_len = 1024;
config.ar_len = ar_len.get();
config.type = "llama3";

// Llama 3 tokenizer, loaded from the tokenizer JSON given via -t/--tokenizer.
auto tokenizer = mllm::models::llama::TinyLlamaTokenizer(tokenizer_path.get());
auto tokenizer = mllm::models::llama::LlamaTokenizer(tokenizer_path.get());
Comment on lines 41 to +44
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Remove stale tokenizer comment.

Lines 41–43 still say Qwen3 tokenizer placeholder, but Line 44 instantiates LlamaTokenizer. Please update/remove this comment to avoid maintenance confusion.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@examples/llama_qnn_aot/aot_run.cpp` around lines 41 - 44, The comment above
the tokenizer instantiation is stale — remove or update the lines that mention
"Using Qwen3 tokenizer as a placeholder" and instead document that
LlamaTokenizer is being used; specifically edit the comment that precedes the
creation of mllm::models::llama::LlamaTokenizer(tokenizer_path.get()) so it
correctly reflects the use of LlamaTokenizer (or notes to implement a
Llama3Tokenizer if different) and remove any leftover references to Qwen3 to
avoid confusion.


auto input_tensor = tokenizer.convertMessage({{
.role = "user",
.content = "hello",
}});
// auto input_tensor = tokenizer.convertMessage({{
// .role = "user",
// .content = "hello",
// }});

input_tensor["sequence"] = mllm::Tensor::arange(0, seq_len.get(), 1, mllm::kInt64, mllm::kCPU).view({1, -1});
// input_tensor["sequence"] = mllm::Tensor::arange(0, seq_len.get(), 1, mllm::kInt64, mllm::kCPU).view({1, -1});

// DBG:
mllm::print(input_tensor["sequence"].shape());
mllm::print(input_tensor["sequence"]);
// // DBG:
// mllm::print(input_tensor["sequence"].shape());
// mllm::print(input_tensor["sequence"]);

// Runner runner(config, &tokenizer);
// if (!runner.load()) {
// std::cerr << "Failed to load model\n";
// return 1;
// }


std::string prompt_text;
fmt::print("💬 Prompt text (or 'exit/quit'): ");
std::getline(std::cin, prompt_text);

auto input_tensor = tokenizer.convertMessage({{.role = "user", .content = prompt_text}});

Comment on lines +65 to 69
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Implement the advertised exit/quit path.

The prompt text says exit/quit (Line 65), but input is always sent to generation. Add an early return when the user enters exit or quit.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@examples/llama_qnn_aot/aot_run.cpp` around lines 65 - 69, The prompt loop
currently always sends input to tokenizer.convertMessage; add an early-exit
check after reading prompt_text that returns or breaks when the user enters
"exit" or "quit" (case-insensitive and trimming surrounding whitespace), and
also handle EOF (std::cin.eof()) similarly; implement this check immediately
before calling tokenizer.convertMessage so prompt_text is not processed when the
user intends to quit.

Runner runner(config, &tokenizer);
if (!runner.load()) {
std::cerr << "Failed to load model\n";
return 1;
}

runner.generate(
input_tensor["sequence"], gen_len.get(), [](const std::string& token) { std::cout << token << std::flush; }, true);
runner.generate(input_tensor["sequence"], config.context_len,
[](const std::string& token) { std::cout << token << std::flush; });
std::cout << "\n";

return 0;
Expand Down
3 changes: 3 additions & 0 deletions mllm/backends/qnn/aot_rt/QnnAOTConfig.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@

#pragma once

#include <string>
#include "mllm/core/DataTypes.hpp"

namespace mllm::qnn::aot {

struct QnnAOTConfig {
std::string type = "qwen3";

int num_layers = 28;
int num_heads = 12;
int head_dim = 128;
Expand Down
18 changes: 16 additions & 2 deletions mllm/backends/qnn/aot_rt/QnnAOTRuntime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,22 @@ bool Runner::load() {
// init token generator(decode)
// TODO: EOS IDs
auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>();
eos_ids->insert(151643);
eos_ids->insert(151645);
// eos_ids->insert(151643);
// eos_ids->insert(151645);

// Dynamically determine the currently loaded model based on the model name.
if (config_.type == "llama3") {
eos_ids->insert(128001); // <|end_of_text|>
eos_ids->insert(128008); // <|eom_id|>
eos_ids->insert(128009); // <|eot_id|>
} else if (config_.type == "qwen2"){
eos_ids->insert(151643);
eos_ids->insert(151645);
} else{
// qwen3
eos_ids->insert(151643);
eos_ids->insert(151645);
}

token_generator_ = std::make_unique<TokenGenerator<uint8_t>>(tokenizer_, kv_manager_.get(), std::move(eos_ids), config_);

Expand Down
245 changes: 245 additions & 0 deletions mllm/models/llama/tokenization_llama.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
// Copyright (c) MLLM Team.
// Licensed under the MIT License.
#pragma once

#include <vector>
#include <string>
#include <ctime>
#include <cwctype>
#include <iomanip>
#include <sstream>
#include <unordered_map>

Comment on lines +5 to +11
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

🧩 Analysis chain

🏁 Script executed:

cat -n mllm/models/llama/tokenization_llama.hpp | head -150

Repository: UbiquitousLearning/mllm

Length of output: 6309


Add the missing wide-char ctype header for std::iswspace.

std::iswspace is used at lines 62, 63, 75, and 77 but <cwctype> is not explicitly included. While the code may compile on some platforms due to transitive includes, this violates portability best practices and C++ idioms. All headers for functions used must be explicitly included.

Proposed fix
 #include <vector>
 #include <string>
+#include <cwctype>
 #include <ctime>
 #include <iomanip>
 #include <sstream>
 #include <unordered_map>
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
#include <vector>
#include <string>
#include <ctime>
#include <iomanip>
#include <sstream>
#include <unordered_map>
#include <vector>
#include <string>
#include <cwctype>
#include <ctime>
#include <iomanip>
#include <sstream>
#include <unordered_map>
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@mllm/models/llama/tokenization_llama.hpp` around lines 5 - 11, The header
tokenization_llama.hpp uses std::iswspace (references at tokenization functions
around lines using iswspace) but doesn't include the wide-char ctype header; add
an explicit `#include` <cwctype> to tokenization_llama.hpp so std::iswspace is
guaranteed to be declared (update the top-of-file includes alongside <string>,
<vector>, etc.), then rebuild to confirm portability on platforms that don't
provide it via transitive includes.

#include "mllm/preprocessor/tokenizers/BPE.hpp"
#include "mllm/models/ARGeneration.hpp"
#include "mllm/preprocessor/tokenizers/Unicode.hpp"
#include "mllm/preprocessor/tokenizers/AutoTokenizer.hpp"

namespace mllm::models::llama {

// Hand-written matcher for the Llama 3 pre-tokenization split pattern:
//
//   (?i:'s|'t|'re|'ve|'m|'ll|'d) | [^\r\n\p{L}\p{N}]?\p{L}+ | \p{N}{1,3}
//   |  ?[^\s\p{L}\p{N}]+[\r\n]* | \s*[\r\n]+ | \s+(?!\S) | \s+
//
// The alternatives are tried in that order at position `pos` of `str`.
//
// @param str      Text to split, as wide characters.
// @param pos      In/out cursor. Advanced past the matched piece on success, left unchanged on failure.
// @param matched  Receives the matched piece on success. Its content is unspecified on failure.
// @return true if some alternative matched at `pos`; false otherwise (including pos >= str.size()).
inline bool llama3TokenizerMatchPattern(const std::wstring& str, size_t& pos, std::wstring& matched) {
  const size_t len = str.size();
  if (pos >= len) return false;

  // 1. Contractions. The pattern is case-insensitive, so compare case-folded input against the
  //    lowercase forms and return the text exactly as it appears in the input.
  static const std::wstring kContractions[] = {L"'s", L"'t", L"'re", L"'ve", L"'m", L"'ll", L"'d"};
  for (const auto& contraction : kContractions) {
    if (pos + contraction.size() > len) continue;
    bool same = true;
    for (size_t i = 0; i < contraction.size() && same; ++i) {
      same = (static_cast<wchar_t>(std::towlower(str[pos + i])) == contraction[i]);
    }
    if (same) {
      matched = str.substr(pos, contraction.size());
      pos += contraction.size();
      return true;
    }
  }

  // 2. Letters, optionally preceded by one character that is not a letter, digit, CR or LF
  //    (this is what glues a leading space onto the following word).
  {
    const size_t original_pos = pos;
    matched.clear();
    if (!preprocessor::isLetter(str[pos]) && !preprocessor::isDigit(str[pos]) && str[pos] != L'\r' && str[pos] != L'\n') {
      matched += str[pos];
      ++pos;
    }
    if (pos < len && preprocessor::isLetter(str[pos])) {
      do {
        matched += str[pos];
        ++pos;
      } while (pos < len && preprocessor::isLetter(str[pos]));
      return true;
    }
    pos = original_pos;  // no letter run: undo the optional prefix
  }

  // 3. Digits, in groups of at most three.
  if (preprocessor::isDigit(str[pos])) {
    size_t end = pos + 1;
    while (end < len && end - pos < 3 && preprocessor::isDigit(str[end])) ++end;
    matched = str.substr(pos, end - pos);
    pos = end;
    return true;
  }

  // 4. Punctuation/symbols: optional single space, one or more characters that are neither
  //    whitespace, letter nor digit, then any trailing CR/LF characters.
  {
    const size_t start = pos;
    if (str[pos] == L' ') { ++pos; }
    if (pos < len && !std::iswspace(str[pos]) && !preprocessor::isLetter(str[pos]) && !preprocessor::isDigit(str[pos])) {
      do { ++pos; } while (pos < len && !std::iswspace(str[pos]) && !preprocessor::isLetter(str[pos]) && !preprocessor::isDigit(str[pos]));
      matched = str.substr(start, pos - start);
      while (pos < len && (str[pos] == L'\r' || str[pos] == L'\n')) {
        matched += str[pos];
        ++pos;
      }
      return true;
    }
    pos = start;
  }

  // 5-7. Whitespace runs.
  if (std::iswspace(str[pos])) {
    size_t run_end = pos;             // one past the end of the whitespace run
    size_t after_last_newline = pos;  // one past the last CR/LF in the run; == pos if there is none
    while (run_end < len && std::iswspace(str[run_end])) {
      if (str[run_end] == L'\r' || str[run_end] == L'\n') after_last_newline = run_end + 1;
      ++run_end;
    }

    size_t end = run_end;  // 7. `\s+`: the whole run
    if (after_last_newline > pos) {
      // 5. `\s*[\r\n]+`: everything up to and including the last newline of the run.
      end = after_last_newline;
    } else if (run_end < len && run_end - pos > 1) {
      // 6. `\s+(?!\S)`: the run is followed by non-whitespace, so leave its last character
      //    behind to become the prefix of the next piece.
      end = run_end - 1;
    }
    matched = str.substr(pos, end - pos);
    pos = end;
    return true;
  }

  return false;
}

// Splits the UTF-8 string `str` into pre-tokenization pieces and appends them, in order, to
// `splitted` (existing elements are kept). A character at which the matcher reports no match
// is skipped and does not appear in the output.
inline void llama3Regex(const std::string& str, std::vector<std::wstring>& splitted) {
  const auto wide = preprocessor::utf8string2WideString(str);
  for (size_t cursor = 0; cursor < wide.size();) {
    std::wstring piece;
    if (!llama3TokenizerMatchPattern(wide, cursor, piece)) {
      ++cursor;  // nothing matched here: step over this character
      continue;
    }
    splitted.push_back(piece);
  }
}

// One chat turn, as consumed by LlamaTokenizer::applyChatTemplate and convertMessage.
struct LlamaMessage {
std::string role;     // Emitted verbatim between <|start_header_id|> and <|end_header_id|>; "system" gets a date preamble.
std::string content;  // Message body, emitted after the header and before <|eot_id|>.
};

// Inherits from the original AutoTokenizer again, as required by aot_run.cpp.
class LlamaTokenizer final : public mllm::preprocessor::AutoTokenizer {
public:
// Loads the vocabulary/merges from the tokenizer JSON at `file_path` and registers the chat
// special tokens.
//
// @param file_path  Path handed to BPE::initFromSentencePieceJson.
// @param add_bos    When true, tokenize()/applyChatTemplate() prepend <|begin_of_text|>.
explicit LlamaTokenizer(const std::string& file_path, bool add_bos = true) : add_bos_(add_bos) {
  preprocessor::initLocal();

  // Build the byte -> unicode table and its inverse (used by _tokenize / detokenize).
  preprocessor::makeBytes2UnicodeMap(bytes_2_unicode_dict_);
  for (const auto& [byte_value, unicode_char] : bytes_2_unicode_dict_) {
    bytes_2_unicode_dict_inverse_.insert({unicode_char, byte_value});
  }

  bpe_.initFromSentencePieceJson(file_path);

  // Special tokens are split out of the text before BPE and looked up as whole tokens.
  static const wchar_t* const kSpecialTokens[] = {
      L"<|begin_of_text|>", L"<|end_of_text|>", L"<|start_header_id|>", L"<|end_header_id|>", L"<|eot_id|>",
  };
  for (const wchar_t* special : kSpecialTokens) { special_tokens_trie_.add(special); }
}

// Builds the preamble that applyChatTemplate() puts in front of a system message:
//
//   "Cutting Knowledge Date: December 2023\nToday Date: <dd Mon YYYY>\n\n"
//
// The date is the current local date. The conversion uses the re-entrant localtime_r /
// localtime_s instead of std::localtime, which hands out a pointer to a shared static buffer
// and may return nullptr. If the conversion fails, the "Today Date" line is omitted.
//
// @return The preamble text, always terminated by a blank line.
std::string getSystemPromptPrefix() {
  const std::time_t now = std::time(nullptr);
  std::tm local_tm{};
#if defined(_WIN32)
  const bool converted = (localtime_s(&local_tm, &now) == 0);
#else
  const bool converted = (localtime_r(&now, &local_tm) != nullptr);
#endif

  std::string prefix = "Cutting Knowledge Date: December 2023\n";
  if (converted) {
    std::ostringstream oss;
    oss << std::put_time(&local_tm, "%d %b %Y");
    prefix += "Today Date: " + oss.str() + "\n";
  }
  prefix += "\n";
  return prefix;
}
Comment on lines +121 to +127
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

🧩 Analysis chain

🏁 Script executed:

cat mllm/models/llama/tokenization_llama.hpp | head -150 | tail -50

Repository: UbiquitousLearning/mllm

Length of output: 2162


🏁 Script executed:

rg "getSystemPromptPrefix" --type hpp --type cpp -B 2 -A 2

Repository: UbiquitousLearning/mllm

Length of output: 95


🏁 Script executed:

# Check if there are any threading or concurrency patterns in the codebase
rg "std::thread|std::mutex|pthread|`#pragma` omp" mllm/models/llama/ | head -20

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Look for other usages of std::localtime or similar time functions
rg "std::localtime|localtime_r|localtime_s|std::chrono" mllm/models/llama/ --type hpp --type cpp

Repository: UbiquitousLearning/mllm

Length of output: 95


🏁 Script executed:

rg "getSystemPromptPrefix" --max-count 10

Repository: UbiquitousLearning/mllm

Length of output: 266


🏁 Script executed:

rg "applyChatTemplate" --max-count 10

Repository: UbiquitousLearning/mllm

Length of output: 1690


🏁 Script executed:

# Check if there's any concurrent usage or threading in the codebase
rg "std::thread|std::async|std::mutex|concurrent" --max-count 20

Repository: UbiquitousLearning/mllm

Length of output: 8668


🏁 Script executed:

# Check what C++ version is being used
find . -name "CMakeLists.txt" -o -name "*.cmake" | head -5 | xargs grep -l "cxx_standard\|CXX_STANDARD" 2>/dev/null

Repository: UbiquitousLearning/mllm

Length of output: 165


🏁 Script executed:

# Look at the full tokenization_llama.hpp file to understand context better
wc -l mllm/models/llama/tokenization_llama.hpp

Repository: UbiquitousLearning/mllm

Length of output: 112


🏁 Script executed:

cat mllm/CMakeLists.txt | grep -i "cxx_standard\|CXX_STANDARD" -A 2 -B 2

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Check if there are any service/server implementations that might call applyChatTemplate concurrently
rg "LlamaTokenizer|applyChatTemplate" mllm/ -A 3 -B 3 | head -50

Repository: UbiquitousLearning/mllm

Length of output: 4198


🏁 Script executed:

# Check the Service.hpp to understand threading model
head -100 mllm/engine/service/Service.hpp

Repository: UbiquitousLearning/mllm

Length of output: 2221


🌐 Web query:

std::localtime thread safety C++ std::chrono alternative

💡 Result:

The C-style function std::localtime is not thread-safe because it returns a pointer to a shared, statically allocated std::tm object, which can be overwritten by concurrent calls from other threads [1][2][3]. The modern, thread-safe alternative in C++ is the C++20 <chrono> library, which provides robust support for time zones and calendar conversions [3][4][5].

Recommended C++20 approach: use std::chrono::zoned_time to handle local time conversions in a thread-safe manner [3][4][6]. Example:

#include <chrono>
#include <format>
#include <iostream>

void print_local_time() {
  auto now = std::chrono::system_clock::now();
  // zoned_time is thread-safe and handles time zone conversions
  auto local_now = std::chrono::zoned_time{std::chrono::current_zone(), now};
  // Use std::format (C++20) for thread-safe, type-safe formatting
  std::cout << std::format("{:%F %T %Z}", local_now) << std::endl;
}

Legacy alternatives (pre-C++20): if you cannot use C++20, you must use platform-specific thread-safe functions or manual synchronization [1][7][8]:
1. POSIX (Linux/macOS): use localtime_r(const std::time_t*, std::tm*), which writes the result into a user-provided buffer [1][2][3].
2. Windows (MSVC): use localtime_s(std::tm*, const std::time_t*), which also uses a user-provided buffer [1][3][8].
3. Manual synchronization: wrap calls to std::localtime in a std::mutex to prevent concurrent access [1][7][4].

Note: avoid std::ctime and std::gmtime for the same reasons, as they also rely on shared static buffers [2][8][4].

Citations:


Use thread-safe time functions instead of std::localtime.

std::localtime is not thread-safe because it returns a pointer to a shared static buffer. Since getSystemPromptPrefix() is called from applyChatTemplate(), which is invoked by concurrent worker threads in the Service implementation, this creates a race condition. Replace it with localtime_r (POSIX) or localtime_s (Windows) — neither of which lives in namespace std — or use the C++20 std::chrono time-zone facilities.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@mllm/models/llama/tokenization_llama.hpp` around lines 121 - 127, The
getSystemPromptPrefix() function uses std::localtime (unsafe for concurrent
calls from applyChatTemplate()); replace it with a thread-safe alternative — on
POSIX call std::localtime_r(&t, &tm_) or on Windows std::localtime_s(&tm_, &t)
(or use std::chrono to format the current date) so that tm_ is populated safely
per-thread; update the declaration/initialization of tm_ accordingly and keep
the existing formatting with std::put_time and oss.str() untouched.


// Renders `messages` into a single Llama 3 chat prompt string.
//
// Layout: optional <|begin_of_text|> (when add_bos_ is set), then for every message
// "<|start_header_id|>ROLE<|end_header_id|>\n\nCONTENT<|eot_id|>". For role "system" the
// content is preceded by getSystemPromptPrefix().
//
// @param messages               Chat turns, rendered in order.
// @param add_generation_prompt  When true, an open assistant header is appended.
// @return The rendered prompt text.
inline std::string applyChatTemplate(const std::vector<LlamaMessage>& messages, bool add_generation_prompt = true) {
  std::string prompt;
  if (add_bos_) { prompt.append("<|begin_of_text|>"); }

  for (const auto& message : messages) {
    prompt.append("<|start_header_id|>").append(message.role).append("<|end_header_id|>\n\n");
    if (message.role == "system") { prompt.append(getSystemPromptPrefix()); }
    prompt.append(message.content).append("<|eot_id|>");
  }

  if (add_generation_prompt) { prompt.append("<|start_header_id|>assistant<|end_header_id|>\n\n"); }
  return prompt;
}

std::vector<std::wstring> _tokenize(const std::string& str) override {
std::vector<std::wstring> ret;
std::vector<std::wstring> splitted;
llama3Regex(str, splitted);
for (const auto& s : splitted) {
auto utf_8_str = preprocessor::wideString2Utf8String(s);
std::wstring mapped_str;
// 执行字节映射
for (unsigned char c : utf_8_str) { mapped_str.push_back(bytes_2_unicode_dict_[c]); }
auto bpe_ts = bpe_._bpe(mapped_str);
for (const auto& bpe_t : bpe_ts) { ret.push_back(bpe_t); }
}
return ret;
}

// Tokenizes text that may contain special tokens.
//
// <|begin_of_text|> is prepended when add_bos_ is set and the text does not already start with
// it. The text is then split around the registered special tokens; special tokens are kept
// whole and every other segment goes through _tokenize().
//
// @param str  UTF-8 text.
// @return Token strings in input order.
std::vector<std::wstring> tokenize(const std::string& str) override {
  const bool starts_with_bos = (str.rfind("<|begin_of_text|>", 0) == 0);
  const std::string text = (add_bos_ && !starts_with_bos) ? "<|begin_of_text|>" + str : str;

  auto segments = special_tokens_trie_.split(preprocessor::utf8string2WideString(text));

  std::vector<std::wstring> result;
  for (const auto& segment : segments) {
    if (!special_tokens_trie_.isSpecialToken(segment)) {
      auto sub_tokens = _tokenize(preprocessor::wideString2Utf8String(segment));
      result.insert(result.end(), sub_tokens.begin(), sub_tokens.end());
    } else {
      result.emplace_back(segment);
    }
  }
  return result;
}

// Returns the raw vocabulary entry for token id `pos_idx`, exactly as bpe_ stores it.
std::wstring _detokenize(int64_t pos_idx) override {
  return bpe_._lookup_inverse_vocab(pos_idx);
}

// Converts token id `pos_idx` back to text.
//
// Each character of the vocabulary entry is mapped back to a byte through the inverse
// byte -> unicode table and the bytes are decoded as UTF-8. If any character is not in the
// table, the vocabulary entry is returned unchanged.
std::wstring detokenize(int64_t pos_idx) override {
  auto raw = _detokenize(pos_idx);

  std::string bytes;
  for (const wchar_t ch : raw) {
    const auto hit = bytes_2_unicode_dict_inverse_.find(ch);
    if (hit == bytes_2_unicode_dict_inverse_.end()) { return raw; }
    bytes.push_back(static_cast<unsigned char>(hit->second));
  }
  return mllm::preprocessor::utf8string2WideString(bytes);
}

// Looks up the vocabulary id of every token string and packs the ids into a kInt64 CPU tensor
// of shape {1, strs.size()}, created with mem type kExtraInput and named "llama-tokenizer-i0".
Tensor convert2Ids(const std::vector<std::wstring>& strs) override {
  std::vector<int64_t> ids;
  for (const auto& token : strs) { ids.emplace_back(bpe_._lookup_vocab(token)); }

  Tensor out = Tensor::empty({1, (int32_t)ids.size()}, kInt64, kCPU)
                   .setMemType(kExtraInput)
                   .setName("llama-tokenizer-i0")
                   .alloc();

  int64_t* dst = out.ptr<int64_t>();
  for (const int64_t id : ids) { *dst++ = id; }
  return out;
}

// 供 test_c.cpp 调用的便捷接口
std::vector<int64_t> encode(const std::string& str) {
auto sub_tokens = tokenize(str);
std::vector<int64_t> ret;
for (auto& token : sub_tokens) { ret.emplace_back(bpe_._lookup_vocab(token)); }
return ret;
}

// Converts a sequence of token ids back to UTF-8 text by detokenizing each id and
// concatenating the results.
std::string decode(const std::vector<int64_t>& ids) {
  std::string text;
  for (const auto& id : ids) {
    text += mllm::preprocessor::wideString2Utf8String(detokenize(id));
  }
  return text;
}

// Renders `messages` with the chat template (generation prompt included), tokenizes the result
// and returns the ids under the key "sequence" as a kInt64 CPU tensor of shape {1, num_tokens}
// (mem type kNormal, named "llama-tokenizer-i0").
ARGenerationOutputPast convertMessage(const std::vector<LlamaMessage>& messages) {
  const auto prompt = applyChatTemplate(messages, true);
  auto tokens = tokenize(prompt);

  std::vector<int64_t> ids;
  for (const auto& tok : tokens) { ids.emplace_back(bpe_._lookup_vocab(tok)); }

  Tensor sequence = Tensor::empty({1, (int32_t)ids.size()}, kInt64, kCPU)
                        .setMemType(kNormal)
                        .setName("llama-tokenizer-i0")
                        .alloc();

  int64_t* dst = sequence.ptr<int64_t>();
  for (const int64_t id : ids) { *dst++ = id; }

  return {{"sequence", sequence}};
}

private:
// Whether tokenize()/applyChatTemplate() prepend <|begin_of_text|>.
bool add_bos_ = true;
// BPE model; initialised from the tokenizer JSON in the constructor.
preprocessor::BPE bpe_;
// Byte -> unicode character table, filled by makeBytes2UnicodeMap in the constructor.
std::unordered_map<std::wint_t, wchar_t> bytes_2_unicode_dict_;
// Inverse of bytes_2_unicode_dict_ (unicode character -> byte), used by detokenize().
std::unordered_map<wchar_t, std::wint_t> bytes_2_unicode_dict_inverse_;
};

} // namespace mllm::models::llama
Loading
Loading