Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
## Unreleased (Planned Release Target: v0.2.1)

### List of Pull Requests
- Improve Profiling [#138](https://github.com/pulp-platform/Deeploy/pull/138)
- FP32 ReduceMean operator improvement [#137](https://github.com/pulp-platform/Deeploy/pull/137)
- Support for RMSNorm (Pow and Sqrt operators) [#136](https://github.com/pulp-platform/Deeploy/pull/136)
- Demo TinyViT compatibility with tiled Siracusa [#124](https://github.com/pulp-platform/Deeploy/pull/124)
Expand Down Expand Up @@ -81,6 +82,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
- Added new waiting-strategy logic with fine-grained `PerTensorWaitingStrategy`
- PULPClusterEngine now accepts a `n_cores` parameter to set the number of cores used
- Added `annotateNCores` method to `PULPDeployer`, which adds an `n_cores` key to the operatorRepresentations of all `PULPClusterEngine` templates
- Calculate non-kernel overhead and show total time spent during profiling

### Changed
- Structure of Tests subdir for improved ordering
Expand Down Expand Up @@ -123,6 +125,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
- Added missing shape annotation to `testTypeInferenceDifferentTypes`
- Refactored DMA code generation (`SnitchDma`, `Mchan`) to correctly overlap transfers and compute in double-buffering mode
- Changed `_mapNode` to `_selectEngine`, which reduces the responsibility of that function to, as the name states, just engine selection
- Print kernel profiling information for all memory levels

### Fixed
- Fixed ReduceMean parallelization and tiling issues described in Issue [#134](https://github.com/pulp-platform/Deeploy/issues/134).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -276,17 +276,13 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
teardownStatements.append(CodeSnippet(self._lineComment, {"comment": "Deinitialize DMA future"}))
teardownStatements.extend(f.deinit() for f in ingressFutures | egressFutures)

metaInfo = TilingMetaInfo(
nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}",
nodeOps = operatorRepresentation['nodeOps'],
numTiles = operatorRepresentation['numTiles'],
totalNumTiles = len(tilingSchedule.outputLoadSchedule),
tileIdxPtr = operatorRepresentation['tileIdxPtr'],
tileIdxVar = "TILING_I",
# TODO: The kernelLevelTiling field is used in profiling to know we are generating code around the kernel.
# The current implementation does this by checking whether we are at the lowest memory level,
# which is hardcoded by the value "L1". Change this to be memory level agnostic.
kernelLevelTiling = self.localMemory == "L1")
metaInfo = TilingMetaInfo(nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}",
nodeOps = operatorRepresentation['nodeOps'],
numTiles = operatorRepresentation['numTiles'],
totalNumTiles = len(tilingSchedule.outputLoadSchedule),
tileIdxPtr = operatorRepresentation['tileIdxPtr'],
tileIdxVar = "TILING_I",
kernelLevelTiling = True)

executionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMAStatements, egressDMAStatements,
openLoopStatements, closeLoopStatements, setupStatements,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,17 +117,13 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,

closeLoopStatements = [CodeSnippet(self._closeTileLoopTemplate, {**operatorRepresentation})]

metaInfo = TilingMetaInfo(
nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}",
nodeOps = operatorRepresentation['nodeOps'],
numTiles = operatorRepresentation['numTiles'],
totalNumTiles = len(tilingSchedule.outputLoadSchedule),
tileIdxPtr = operatorRepresentation['tileIdxPtr'],
tileIdxVar = "TILING_I",
# TODO: The kernelLevelTiling field is used in profiling to know we are generating code around the kernel.
# The current implementation does this by checking whether we are at the lowest memory level,
# which is hardcoded by the value "L1". Change this to be memory level agnostic.
kernelLevelTiling = self.localMemory == "L1")
metaInfo = TilingMetaInfo(nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}",
nodeOps = operatorRepresentation['nodeOps'],
numTiles = operatorRepresentation['numTiles'],
totalNumTiles = len(tilingSchedule.outputLoadSchedule),
tileIdxPtr = operatorRepresentation['tileIdxPtr'],
tileIdxVar = "TILING_I",
kernelLevelTiling = True)

executionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMAStatements, egressDMAStatements,
openLoopStatements, closeLoopStatements, setupStatements,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,14 +79,27 @@ class ProfilingPrototypeMixIn(ABC):

_printLoopSetup = NodeTemplate("""
StopTimer();
printf("===== Profiling ${nodeName} =====\\n");
for (int ${profileIdxVar} = ((*${tileIdxPtr} > 0) ? ${numTiles}[(*${tileIdxPtr} - 1)] : 0);
${profileIdxVar} < ${numTiles}[*${tileIdxPtr}];
${profileIdxVar}++){
""")

_printCycleDifference = NodeTemplate(r"""
printf("%s%u] %s%u%s", ${prefixStr}, ${profileIdxVar}, "${flavorStr}", \
${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}], ${suffixStr});
_measurementDeclaration = NodeTemplate("""
uint32_t ${measurement} = ${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}];
""")

_printCycleDifference = NodeTemplate("""
printf("%s%u] %s%6u%s", ${prefixStr}, ${profileIdxVar}, "${flavorStr}", \
${measurement}, ${suffixStr});
""")

_printCycleContribution = NodeTemplate("""
uint32_t total = ${measurementInput} + ${measurementKernel} + ${measurementOutput};
uint32_t dma = ${measurementInput} + ${measurementOutput};
float overhead_percentage = (total == 0) ? 0 : dma * 100.0f / total;
float kernel_percentage = (total == 0) ? 0 : ${measurementKernel} * 100.0f / total;
printf("%s%u] Total :%6u cycles (%2.1f%% Kernel + %2.1f%% Overhead, %u + %u)\\n", ${prefixStr}, ${profileIdxVar}, total, kernel_percentage, overhead_percentage , ${measurementKernel}, dma);
""")

_printLoopTeardown = NodeTemplate("""
Expand Down Expand Up @@ -151,13 +164,37 @@ def injectPrintCycleDiff(cls, executionBlock: ExecutionBlock, metaInfo: TilingMe
"tileIdxPtr": tileIdxPtr,
})

executionBlock.addRight(
cls._measurementDeclaration, {
"measurement": f"{nodeName}_ingress_dma_wait_measurement",
"measurementsStart": f"{nodeName}_ingress_dma_wait_start_measurements",
"measurementsEnd": f"{nodeName}_ingress_dma_wait_end_measurements",
"profileIdxVar": profileIdxVar,
})

if metaInfo.kernelLevelTiling:
executionBlock.addRight(
cls._measurementDeclaration, {
"measurement": f"{nodeName}_kernel_measurement",
"measurementsStart": f"{nodeName}_kernel_start_measurements",
"measurementsEnd": f"{nodeName}_kernel_end_measurements",
"profileIdxVar": profileIdxVar,
})

executionBlock.addRight(
cls._measurementDeclaration, {
"measurement": f"{nodeName}_egress_dma_wait_measurement",
"measurementsStart": f"{nodeName}_egress_dma_wait_start_measurements",
"measurementsEnd": f"{nodeName}_egress_dma_wait_end_measurements",
"profileIdxVar": profileIdxVar,
})

executionBlock.addRight(
cls._printCycleDifference, {
"prefixStr": f"{nodeName}_prefix",
"suffixStr": f"{nodeName}_suffix",
"flavorStr": "Input DMA took ",
"measurementsStart": f"{nodeName}_ingress_dma_wait_start_measurements",
"measurementsEnd": f"{nodeName}_ingress_dma_wait_end_measurements",
"flavorStr": "Pre-Kernel :",
"measurement": f"{nodeName}_ingress_dma_wait_measurement",
"profileIdxVar": profileIdxVar,
})

Expand All @@ -166,22 +203,32 @@ def injectPrintCycleDiff(cls, executionBlock: ExecutionBlock, metaInfo: TilingMe
cls._printCycleDifference, {
"prefixStr": f"{nodeName}_prefix",
"suffixStr": f"{nodeName}_suffix",
"flavorStr": "Kernel took ",
"measurementsStart": f"{nodeName}_kernel_start_measurements",
"measurementsEnd": f"{nodeName}_kernel_end_measurements",
"flavorStr": "Kernel :",
"measurement": f"{nodeName}_kernel_measurement",
"profileIdxVar": profileIdxVar,
})

executionBlock.addRight(
cls._printCycleDifference, {
"prefixStr": f"{nodeName}_prefix",
"suffixStr": f"{nodeName}_suffix",
"flavorStr": "Output DMA took ",
"measurementsStart": f"{nodeName}_egress_dma_wait_start_measurements",
"measurementsEnd": f"{nodeName}_egress_dma_wait_end_measurements",
"flavorStr": "Post-Kernel:",
"measurement": f"{nodeName}_egress_dma_wait_measurement",
"profileIdxVar": profileIdxVar,
})

# Total Time: Input + Kernel + Output
# Overhead: (Input + Output) / Total
if metaInfo.kernelLevelTiling:
executionBlock.addRight(
cls._printCycleContribution, {
"prefixStr": f"{nodeName}_prefix",
"measurementInput": f"{nodeName}_ingress_dma_wait_measurement",
"measurementKernel": f"{nodeName}_kernel_measurement",
"measurementOutput": f"{nodeName}_egress_dma_wait_measurement",
"profileIdxVar": profileIdxVar,
})

executionBlock.addRight(cls._printLoopTeardown, {})

return executionBlock
Expand Down
Loading