2222
2323#include < onnxruntime_cxx_api.h>
2424
25- // #include <kaldi-native-fbank/csrc/feature-fbank.h>
2625#include " kaldi-native-fbank/csrc/online-feature.h"
27- // #include <kaldi-native-fbank/csrc/feature-window.h>
2826
2927using namespace StackFlows ;
3028
@@ -214,208 +212,30 @@ class llm_task {
214212 return triggered;
215213 }
216214
217- std::vector<std::vector<float >> compute_fbank_kaldi1 (const std::vector<float > &waveform, int sample_rate,
218- int num_mel_bins)
219- {
220- fbank_.reset ();
221- fbank_ = std::make_unique<knf::OnlineFbank>(opts_);
222-
223- fbank_->AcceptWaveform (sample_rate, waveform.data (), waveform.size ());
224-
225- int num_frames = fbank_->NumFramesReady ();
226-
227- std::vector<std::vector<float >> features;
228- SLOGE (" num_frames %d" , num_frames);
229- features.reserve (num_frames);
230-
231- for (int i = 0 ; i < num_frames; ++i) {
232- const float *frame_data = fbank_->GetFrame (i);
233- std::vector<float > frame (frame_data, frame_data + num_mel_bins);
234- features.push_back (std::move (frame));
235- }
236-
237- return features;
238- }
239-
240215 std::vector<std::vector<float >> compute_fbank_kaldi (const std::vector<float > &waveform, int sample_rate,
241216 int num_mel_bins)
242217 {
243218 fbank_.reset ();
244219 fbank_ = std::make_unique<knf::OnlineFbank>(opts_);
245220 fbank_->AcceptWaveform (sample_rate, waveform.data (), waveform.size ());
246221 int num_frames = fbank_->NumFramesReady ();
222+
247223 std::vector<std::vector<float >> features;
248- SLOGE ( " num_frames %d " , num_frames);
224+
249225 features.reserve (num_frames);
250226 for (int i = 0 ; i < num_frames; ++i) {
251227 const float *frame_data = fbank_->GetFrame (i);
252228 std::vector<float > frame (frame_data, frame_data + num_mel_bins);
253229 features.push_back (std::move (frame));
254230 }
255231
256- // 生成文件名序号(三位数,补零)
257- // std::stringstream ss;
258- // ss << std::setfill('0') << std::setw(3) << file_counter;
259- // std::string file_suffix = ss.str();
260-
261- // 保存 waveform 为二进制
262- // std::string waveform_filename = "waveform_" + file_suffix + ".bin";
263- // std::ofstream waveform_bin(waveform_filename, std::ios::binary);
264- // if (waveform_bin.is_open()) {
265- // size_t size = waveform.size();
266- // waveform_bin.write(reinterpret_cast<const char *>(&size), sizeof(size));
267- // waveform_bin.write(reinterpret_cast<const char *>(waveform.data()), sizeof(float) * waveform.size());
268- // waveform_bin.close();
269- // SLOGE("Waveform saved to %s", waveform_filename.c_str());
270- // }
271-
272- // 保存 features 为二进制
273- // std::string features_filename = "features_" + file_suffix + ".bin";
274- // std::ofstream features_bin(features_filename, std::ios::binary);
275- // if (features_bin.is_open()) {
276- // size_t rows = features.size();
277- // size_t cols = (rows > 0) ? features[0].size() : 0;
278- // features_bin.write(reinterpret_cast<const char *>(&rows), sizeof(rows));
279- // features_bin.write(reinterpret_cast<const char *>(&cols), sizeof(cols));
280- // for (const auto &row : features) {
281- // features_bin.write(reinterpret_cast<const char *>(row.data()), sizeof(float) * row.size());
282- // }
283- // features_bin.close();
284- // SLOGE("Features saved to %s", features_filename.c_str());
285- // }
286-
287- // 递增计数器
288- // file_counter++;
289-
290232 return features;
291233 }
292234
293- std::vector<std::vector<float >> read_mat_from_bin (int index)
294- {
295- std::stringstream ss;
296- ss << " mat_" << std::setfill (' 0' ) << std::setw (3 ) << index << " .bin" ;
297- std::string filename = ss.str ();
298-
299- std::ifstream file (filename, std::ios::binary);
300- if (!file.is_open ()) {
301- SLOGE (" 无法打开文件: %s" , filename.c_str ());
302- return {};
303- }
304-
305- // 读取维度信息
306- int64_t rows, cols;
307- file.read (reinterpret_cast <char *>(&rows), sizeof (int64_t ));
308- file.read (reinterpret_cast <char *>(&cols), sizeof (int64_t ));
309-
310- SLOGE (" 读取mat文件 %s, shape: (%ld, %ld)" , filename.c_str (), rows, cols);
311-
312- // 读取数据
313- std::vector<float > data (rows * cols);
314- file.read (reinterpret_cast <char *>(data.data ()), sizeof (float ) * data.size ());
315- file.close ();
316-
317- // 转换为二维向量
318- std::vector<std::vector<float >> result;
319- result.reserve (rows);
320-
321- for (int i = 0 ; i < rows; ++i) {
322- std::vector<float > row;
323- row.reserve (cols);
324- for (int j = 0 ; j < cols; ++j) {
325- row.push_back (data[i * cols + j]);
326- }
327- result.push_back (std::move (row));
328- }
329-
330- return result;
331- }
332-
333- std::vector<float > run_inference1 (const std::vector<float > &audio_chunk_16k)
334- {
335- auto fbank_feats = compute_fbank_kaldi (audio_chunk_16k, RESAMPLE_RATE, FEAT_DIM);
336- SLOGE (" === FBank Features ===" );
337- for (int i = 0 ; i < fbank_feats.size (); ++i) {
338- SLOGE (" Frame %d:" , i);
339- std::string frame_str = " " ;
340- for (int j = 0 ; j < fbank_feats[i].size (); ++j) {
341- frame_str += std::to_string (fbank_feats[i][j]) + " " ;
342- }
343- SLOGE (" %s" , frame_str.c_str ());
344- }
345- SLOGE (" fbank_feats.size()=%d" , (int )fbank_feats.size ());
346- SLOGE (" =======================" );
347-
348- if (fbank_feats.empty ()) {
349- return {};
350- }
351-
352- int T = fbank_feats.size ();
353- std::vector<float > mat_flattened;
354- for (const auto &feat : fbank_feats) {
355- mat_flattened.insert (mat_flattened.end (), feat.begin (), feat.end ());
356- }
357-
358- std::vector<int64_t > input_shape = {1 , static_cast <int64_t >(T), FEAT_DIM};
359- std::vector<int64_t > cache_shape = {1 , 32 , 88 };
360-
361- Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu (OrtDeviceAllocator, OrtMemTypeDefault);
362-
363- Ort::Value input_tensor = Ort::Value::CreateTensor<float >(
364- memory_info, mat_flattened.data (), mat_flattened.size (), input_shape.data (), input_shape.size ());
365-
366- Ort::Value cache_tensor = Ort::Value::CreateTensor<float >(memory_info, cache.data (), cache.size (),
367- cache_shape.data (), cache_shape.size ());
368-
369- const char *input_names[] = {" input" , " cache" };
370- const char *output_names[] = {" output" , " r_cache" };
371-
372- std::vector<Ort::Value> inputs;
373- inputs.push_back (std::move (input_tensor));
374- inputs.push_back (std::move (cache_tensor));
375-
376- auto output_tensors = session->Run (Ort::RunOptions{nullptr }, input_names, inputs.data (), 2 , output_names, 2 );
377-
378- float *out_data = output_tensors[0 ].GetTensorMutableData <float >();
379- float *cache_out_data = output_tensors[1 ].GetTensorMutableData <float >();
380-
381- std::vector<int64_t > out_shape = output_tensors[0 ].GetTensorTypeAndShapeInfo ().GetShape ();
382- size_t out_size = 1 ;
383- for (auto dim : out_shape) out_size *= dim;
384-
385- std::vector<float > out_chunk (out_data, out_data + out_size);
386-
387- std::copy (cache_out_data, cache_out_data + cache.size (), cache.begin ());
388-
389- return out_chunk;
390- }
391-
392235 std::vector<float > run_inference (const std::vector<float > &audio_chunk_16k)
393236 {
394237 std::vector<std::vector<float >> fbank_feats;
395- static int file_index = -1 ;
396- // file_index++;
397- if (file_index >= 0 ) {
398- // 从文件读取
399- fbank_feats = read_mat_from_bin (file_index);
400- SLOGE (" === 从文件读取的 FBank Features ===" );
401- } else {
402- // 原来的计算方式
403- fbank_feats = compute_fbank_kaldi (audio_chunk_16k, RESAMPLE_RATE, FEAT_DIM);
404- SLOGE (" === 计算的 FBank Features ===" );
405- }
406- if (file_index == 6 ) file_index = 0 ;
407-
408- // 打印特征用于对比
409- // for (int i = 0; i < fbank_feats.size(); ++i) {
410- // SLOGE("Frame %d:", i);
411- // std::string frame_str = "";
412- // for (int j = 0; j < fbank_feats[i].size(); ++j) {
413- // frame_str += std::to_string(fbank_feats[i][j]) + " ";
414- // }
415- // SLOGE("%s", frame_str.c_str());
416- // }
417- // SLOGE("fbank_feats.size()=%d", (int)fbank_feats.size());
418- // SLOGE("=======================");
238+ fbank_feats = compute_fbank_kaldi (audio_chunk_16k, RESAMPLE_RATE, FEAT_DIM);
419239
420240 if (fbank_feats.empty ()) {
421241 return {};
@@ -461,9 +281,6 @@ class llm_task {
461281 void sys_pcm_on_data (const std::string &raw)
462282 {
463283 static int count = 0 ;
464- static std::ofstream ffout (" audio_float.raw" , std::ios::binary | std::ios::app);
465- static std::ofstream fout (" audio_int16.raw" , std::ios::binary | std::ios::app);
466-
467284 if (count < delay_audio_frame_) {
468285 buffer_write_char (pcmdata, raw.data (), raw.length ());
469286 count++;
@@ -487,18 +304,9 @@ class llm_task {
487304 buffer_resize (pcmdata, 0 );
488305 count = 0 ;
489306
490- if (!floatSamples.empty ()) {
491- ffout.write (reinterpret_cast <const char *>(floatSamples.data ()), floatSamples.size () * sizeof (float ));
492- }
493-
494- if (!int16Samples.empty ()) {
495- fout.write (reinterpret_cast <const char *>(int16Samples.data ()), int16Samples.size () * sizeof (int16_t ));
496- }
497-
498307 auto scores = run_inference (floatSamples);
499308 if (detect_wakeup (scores)) {
500309 if (enwake_audio_ && (!wake_wav_file_.empty ()) && play_awake_wav) {
501- SLOGE (" \n\n\n\n\n " );
502310 play_awake_wav (wake_wav_file_);
503311 }
504312 if (out_callback_) out_callback_ (" " , true );
0 commit comments