Skip to content

Commit 4f3ac17

Browse files
authored
Revert "Revert "Qualcomm AI Engine Direct - heap profiling at runtime… (pytorch#19710)
… with HTP backend" (pytorch#19705)" This reverts commit 576ed77 which reverted the original heap profiling feature (pytorch#19224). The internal CI test failures that caused the original revert have been fixed by adding @unittest.skip overrides for test_qnn_backend_runtime_option_heap_profile in the simulator test classes. Original commit changeset: 576ed77
1 parent 13a5235 commit 4f3ac17

22 files changed

Lines changed: 539 additions & 70 deletions

backends/qualcomm/debugger/README.md

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ qairt_visualizer.view(reports=[optrace, qhas])
7878
- `model`: Path to your QNN model file (e.g., `path_to_your_model.dlc`).
7979
- **`reports`**: List of report file paths, including the optrace (`optrace.json`) and QHAS (`optrace_qnn_htp_analysis_summary.json`).
8080

81-
Note: Files ending with `.bin ` do not support graph visualization in qairt_visualizer.
81+
Note: Files ending with `.bin` do not support graph visualization in qairt_visualizer.
8282

8383
## Demo
8484

@@ -266,3 +266,79 @@ python -m examples.qualcomm.util_scripts.qnn_intermediate_debugger_demo -b build
266266
3. Does not support graphs with partitions (partial delegation).
267267
4. Does not support LLM models.
268268
5. Does not support graphs with multiple methods.
269+
270+
271+
## ExecuTorch QNN HTP Heap Profiling
272+
273+
Measures DSP memory usage when using context binary models on the HTP backend.
274+
275+
### Introduction
276+
277+
DSP heap profiling is available for `QnnContext_createFromBinary` use-cases. It captures total DSP heap usage at two checkpoints:
278+
279+
- **Before the first context is created** (`before_context_created`)
280+
- **After the last context is freed** (`after_context_freed`)
281+
282+
The difference between the two values represents heap consumed during context execution. The value measured after freeing is typically equal to or greater than the value measured before creation.
283+
284+
### Instructions
285+
286+
#### Run the example test
287+
288+
```bash
289+
python backends/qualcomm/tests/test_qnn_delegate.py \
290+
TestQNNQuantizedUtils.test_qnn_backend_runtime_option_heap_profile \
291+
-b build-android -H ${HOST} -s ${SN} -m ${SOC_MODEL}
292+
```
293+
294+
See [test_qnn_delegate.py](../tests/test_qnn_delegate.py) for the full test implementation.
295+
296+
#### Setting
297+
298+
```python
299+
from executorch.backends.qualcomm.utils.utils import generate_htp_compiler_spec
300+
from executorch.backends.qualcomm.utils.utils import generate_qnn_executorch_compiler_spec
301+
302+
backend_options = generate_htp_compiler_spec(
303+
use_multi_contexts=True,
304+
)
305+
306+
compiler_specs = generate_qnn_executorch_compiler_spec(
307+
soc_model=self.chipset_table[TestQNN.soc_model],
308+
backend_options=backend_options,
309+
profile_level=2,
310+
)
311+
312+
# ...
313+
314+
self.verify_output(
315+
module,
316+
sample_input,
317+
exec_prog,
318+
save_heap_result=True,
319+
)
320+
```
321+
322+
#### Output file format
323+
324+
The result is written to a text file (default: `htp_heap_usage.txt`) with two lines:
325+
326+
```
327+
DSP:before_context_created (bytes), <value>
328+
DSP:after_context_freed (bytes), <value>
329+
```
330+
331+
#### Reference result
332+
333+
Measured on SM8850. A difference of 0 means no additional heap is consumed during context binary execution.
334+
335+
```console
336+
First value (before_context_created): 928212 bytes
337+
Second value (after_context_freed): 928212 bytes
338+
difference: 0.00 bytes
339+
```
340+
341+
### Limitations
342+
343+
1. Only supported with the HTP backend on Android and QNX platforms.
344+
2. Enabling this feature may affect initialization and cleanup time.

backends/qualcomm/export_utils.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -493,6 +493,11 @@ def pull_debug_output(self, etdump_path, debug_ouput_path, callback=None):
493493
if callback:
494494
callback()
495495

496+
def pull_heap_output(self, src_file_path, dst_folder, callback=None):
497+
self._adb(["pull", src_file_path, dst_folder])
498+
if callback:
499+
callback()
500+
496501

497502
def build_executorch_binary(
498503
model: torch.nn.Module, # noqa: B006

backends/qualcomm/runtime/QnnBackendOptions.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,14 @@ template QnnExecuTorchProfileLevel get_option<QnnExecuTorchProfileLevel>(
5252
QnnExecuTorchProfileLevel,
5353
const char*);
5454

55+
executorch::runtime::Error get_runtime_option(
56+
const char* key,
57+
executorch::runtime::BackendOption& backend_option) {
58+
std::strncpy(backend_option.key, key, runtime::kMaxOptionKeyLength);
59+
backend_option.key[runtime::kMaxOptionKeyLength - 1] = '\0';
60+
return get_option(QNN_BACKEND, backend_option);
61+
}
62+
5563
} // namespace qnn
5664
} // namespace backends
5765
} // namespace executorch

backends/qualcomm/runtime/QnnBackendOptions.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,19 @@ struct RuntimeOption {
3737
template <typename T>
3838
T get_option(T aot_option, const char* aot_key);
3939

40+
/**
41+
* @brief
42+
* Get the backend option.
43+
* This method checks runtime option only.
44+
*
45+
* @param key The key of runtime option.
46+
* @param backend_option The backend option to be populated with the value set at runtime.
47+
*/
48+
49+
executorch::runtime::Error get_runtime_option(
50+
const char* key,
51+
executorch::runtime::BackendOption& backend_option);
52+
4053
} // namespace qnn
4154
} // namespace backends
4255
} // namespace executorch

backends/qualcomm/runtime/QnnExecuTorch.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#define QNN_RUNTIME_LPAI_CLIENT_PERF_TYPE "qnn_runtime_lpai_client_perf_type"
2626
#define QNN_RUNTIME_LPAI_AFFINITY "qnn_runtime_lpai_affinity"
2727
#define QNN_RUNTIME_LPAI_CORE_SELECTION "qnn_runtime_lpai_core_selection"
28+
#define QNN_RUNTIME_HEAP_PROFILING_PATH "qnn_runtime_heap_profiling_path"
2829

2930
#ifdef __cplusplus
3031
extern "C" {

backends/qualcomm/runtime/QnnExecuTorchBackend.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,13 @@ executorch::runtime::Error QnnExecuTorchBackend::set_option(
245245
qnn_runtime_lpai_core_selection_.value = *val;
246246
qnn_runtime_lpai_core_selection_.is_set = true;
247247
}
248+
} else if (strcmp(option.key, QNN_RUNTIME_HEAP_PROFILING_PATH) == 0) {
249+
if (auto* val =
250+
std::get_if<std::array<char, runtime::kMaxOptionValueLength>>(
251+
&option.value)) {
252+
qnn_runtime_heap_profiling_path_.value = *val;
253+
qnn_runtime_heap_profiling_path_.is_set = true;
254+
}
248255
} else {
249256
ET_LOG(
250257
Error,
@@ -268,6 +275,7 @@ executorch::runtime::Error QnnExecuTorchBackend::get_option(
268275
executorch::runtime::BackendOptionContext& context,
269276
executorch::runtime::Span<executorch::runtime::BackendOption>&
270277
backend_options) {
278+
std::lock_guard<std::mutex> guard(runtime_option_mutex_);
271279
size_t matches = backend_options.size();
272280
for (size_t i = 0; i < backend_options.size(); ++i) {
273281
// Set the value to what was stored by set_option
@@ -303,6 +311,10 @@ executorch::runtime::Error QnnExecuTorchBackend::get_option(
303311
strcmp(backend_options[i].key, QNN_RUNTIME_LPAI_CORE_SELECTION) == 0 &&
304312
qnn_runtime_lpai_core_selection_.is_set) {
305313
backend_options[i].value = qnn_runtime_lpai_core_selection_.value;
314+
} else if (
315+
strcmp(backend_options[i].key, QNN_RUNTIME_HEAP_PROFILING_PATH) == 0 &&
316+
qnn_runtime_heap_profiling_path_.is_set) {
317+
backend_options[i].value = qnn_runtime_heap_profiling_path_.value;
306318
} else {
307319
// either runtime never called set_option or key does not exist
308320
matches--;

backends/qualcomm/runtime/QnnExecuTorchBackend.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ class QnnExecuTorchBackend final
7171
RuntimeOption qnn_runtime_lpai_client_perf_type_{false, 0};
7272
RuntimeOption qnn_runtime_lpai_affinity_{false, 0};
7373
RuntimeOption qnn_runtime_lpai_core_selection_{false, 0};
74+
RuntimeOption qnn_runtime_heap_profiling_path_{false, {}};
7475
};
7576

7677
} // namespace qnn

backends/qualcomm/runtime/backends/QnnBackendFactory.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,8 @@ std::unique_ptr<BackendConfigParameters> QnnBackendFactory::Create(
7171
qnn_device_ptr,
7272
backend_params->qnn_backend_cache_ptr_.get(),
7373
htp_options,
74-
qnn_dlc_manager);
74+
qnn_dlc_manager,
75+
get_option(options->profile_level(), QNN_RUNTIME_PROFILE_LEVEL));
7576

7677
backend_params->qnn_graph_ptr_ = std::make_unique<HtpGraph>(
7778
implementation_ptr,

backends/qualcomm/runtime/backends/QnnContextCommon.cpp

Lines changed: 70 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,26 +6,63 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9+
#include <executorch/backends/qualcomm/runtime/QnnBackendOptions.h>
910
#include <executorch/backends/qualcomm/runtime/backends/QnnContextCommon.h>
1011
#include <executorch/backends/qualcomm/runtime/backends/QnnDlcManager.h>
1112

1213
namespace executorch {
1314
namespace backends {
1415
namespace qnn {
1516

17+
std::mutex QnnContext::htp_context_mutex_;
18+
int QnnContext::htp_context_count_{0};
19+
20+
void QnnContext::WriteHeapProfile() {
21+
executorch::runtime::BackendOption backend_option;
22+
std::string heap_profiling_path;
23+
if (get_runtime_option(QNN_RUNTIME_HEAP_PROFILING_PATH, backend_option) ==
24+
Error::Ok) {
25+
auto* arr = std::get_if<std::array<char, runtime::kMaxOptionValueLength>>(
26+
&backend_option.value);
27+
if (arr) {
28+
heap_profiling_path = arr->data();
29+
}
30+
}
31+
Qnn_ErrorHandle_t error_profile =
32+
qnn_profiler_->ProfileDataToFile(heap_profiling_path);
33+
if (error_profile != QNN_SUCCESS) {
34+
QNN_EXECUTORCH_LOG_ERROR(
35+
"Failed to profile. Cannot get profile from handle. Error %d",
36+
QNN_GET_ERROR_CODE(error_profile));
37+
}
38+
}
39+
1640
QnnContext::~QnnContext() {
1741
const QnnInterface& qnn_interface = implementation_->GetQnnInterface();
1842
Qnn_ErrorHandle_t error = QNN_SUCCESS;
43+
1944
if (handle_ != nullptr) {
2045
QNN_EXECUTORCH_LOG_INFO("Destroy Qnn context");
21-
error = qnn_interface.qnn_context_free(handle_, /*profile=*/nullptr);
46+
47+
bool do_heap_profile = false;
48+
{
49+
std::lock_guard<std::mutex> lock(htp_context_mutex_);
50+
if (is_htp_backend_ && htp_context_count_ > 0 && need_to_profile_) {
51+
--htp_context_count_;
52+
do_heap_profile = (htp_context_count_ == 0);
53+
}
54+
}
55+
error = qnn_interface.qnn_context_free(
56+
handle_, do_heap_profile ? qnn_profiler_->GetHandle() : nullptr);
2257
if (error != QNN_SUCCESS) {
2358
QNN_EXECUTORCH_LOG_ERROR(
2459
"Failed to free QNN "
2560
"context_handle_. Backend "
2661
"ID %u, error %d",
2762
qnn_interface.GetBackendId(),
2863
QNN_GET_ERROR_CODE(error));
64+
} else if (do_heap_profile) {
65+
WriteHeapProfile();
2966
}
3067
handle_ = nullptr;
3168
}
@@ -45,21 +82,51 @@ Error QnnContext::Configure() {
4582
if (cache_->GetCacheState() == QnnBackendCache::DESERIALIZE) {
4683
const QnnExecuTorchContextBinary& qnn_context_blob =
4784
cache_->GetQnnContextBlob();
85+
/*
86+
Total DSP heap usage can be measured at two points: first context
87+
creation and last context free. Per the QNN documentation, the profile handle
88+
must be passed to qnn_context_create_from_binary when creating the first
89+
context and to qnn_context_free when freeing the last context.
90+
91+
There are two limitations:
92+
1. Only supported on Android and QNX platforms.
93+
2. Enabling this feature might affect initialization and cleanup
94+
time.
95+
*/
96+
97+
bool do_heap_profile = false;
98+
{
99+
std::lock_guard<std::mutex> lock(htp_context_mutex_);
100+
do_heap_profile =
101+
is_htp_backend_ && (htp_context_count_ == 0) && need_to_profile_;
102+
if (is_htp_backend_) {
103+
++htp_context_count_;
104+
}
105+
}
48106

49107
error = qnn_interface.qnn_context_create_from_binary(
50108
backend_->GetHandle(),
51109
device_->GetHandle(),
52-
temp_context_config.empty() ? nullptr : temp_context_config.data(),
110+
(temp_context_config.empty() ? nullptr : temp_context_config.data()),
53111
static_cast<uint8_t*>(qnn_context_blob.buffer),
54112
qnn_context_blob.nbytes,
55113
&handle_,
56-
/*profile=*/nullptr);
114+
do_heap_profile ? qnn_profiler_->GetHandle() : nullptr);
57115
if (error != QNN_SUCCESS) {
58116
QNN_EXECUTORCH_LOG_ERROR(
59117
"Can't create context from "
60118
"binary. Error %d.",
61119
QNN_GET_ERROR_CODE(error));
120+
// Rollback the count since context creation failed
121+
{
122+
std::lock_guard<std::mutex> lock(htp_context_mutex_);
123+
if (is_htp_backend_ && htp_context_count_ > 0) {
124+
--htp_context_count_;
125+
}
126+
}
62127
return Error::Internal;
128+
} else if (do_heap_profile) {
129+
WriteHeapProfile();
63130
}
64131
} else if (
65132
cache_->GetCacheState() == QnnBackendCache::SERIALIZE ||

backends/qualcomm/runtime/backends/QnnContextCommon.h

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,10 @@
1313
#include <executorch/backends/qualcomm/runtime/backends/QnnCustomProtocol.h>
1414
#include <executorch/backends/qualcomm/runtime/backends/QnnDeviceCommon.h>
1515

16+
#include <executorch/backends/qualcomm/runtime/backends/QnnProfiler.h>
17+
1618
#include <memory>
19+
#include <mutex>
1720

1821
namespace executorch {
1922
namespace backends {
@@ -28,13 +31,22 @@ class QnnContext {
2831
QnnBackend* backend,
2932
QnnDevice* device,
3033
QnnBackendCache* cache,
31-
QnnDlcManager* qnn_dlc_manager)
34+
QnnDlcManager* qnn_dlc_manager,
35+
const QnnExecuTorchProfileLevel& profile_level)
3236
: handle_(nullptr),
3337
implementation_(implementation),
3438
backend_(backend),
3539
device_(device),
3640
cache_(cache),
37-
qnn_dlc_manager_(qnn_dlc_manager) {}
41+
qnn_dlc_manager_(qnn_dlc_manager),
42+
is_htp_backend_(
43+
implementation->GetQnnInterface().GetBackendId() ==
44+
QNN_BACKEND_ID_HTP),
45+
need_to_profile_(
46+
profile_level != QnnExecuTorchProfileLevel::kProfileOff) {
47+
qnn_profiler_ =
48+
std::make_unique<QnnProfile>(implementation_, backend_, profile_level);
49+
}
3850

3951
virtual ~QnnContext();
4052

@@ -73,13 +85,20 @@ class QnnContext {
7385
};
7486

7587
private:
88+
void WriteHeapProfile();
7689
Qnn_ContextHandle_t handle_;
7790
QnnImplementation* implementation_;
7891
QnnBackend* backend_;
7992
QnnDevice* device_;
8093
QnnBackendCache* cache_;
8194
QnnContextCustomProtocol qnn_context_custom_protocol_;
8295
QnnDlcManager* qnn_dlc_manager_;
96+
97+
std::unique_ptr<QnnProfile> qnn_profiler_;
98+
bool is_htp_backend_;
99+
bool need_to_profile_;
100+
static std::mutex htp_context_mutex_;
101+
static int htp_context_count_;
83102
};
84103
} // namespace qnn
85104
} // namespace backends

0 commit comments

Comments
 (0)