import os
import math
import torch
+ from typing import Optional
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from torch._dynamo.utils import dynamo_timed
from torch._inductor.codegen import cpp, wrapper, common, memory_planning
+ from torch._inductor.ir import GraphPartitionSignature
from torch._inductor.virtualized import V, _ops as ops
from torch._inductor.codecache import write_atomic, write
from torch._inductor.utils import (
@@ -75,10 +77,25 @@ def reduction_combine_vec(reduction_type, vector_value, init_value, axis, shape,
        return f"vector.multi_reduction <and>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
    raise AssertionError(reduction_type)

- class ExtensionWrapperCodegen(wrapper.WrapperCodeGen):
+ class ExtensionWrapperCodegen(wrapper.PythonWrapperCodegen):
    def __init__(self):
        super().__init__()

+     @classmethod
+     def create(
+         cls,
+         is_subgraph: bool,
+         subgraph_name: Optional[str],
+         parent_wrapper: Optional[wrapper.PythonWrapperCodegen],
+         partition_signatures: Optional[GraphPartitionSignature] = None,
+     ):
+         if is_subgraph:
+             assert subgraph_name is not None and parent_wrapper is not None
+             return wrapper.SubgraphPythonWrapperCodegen(
+                 subgraph_name, parent_wrapper, partition_signatures
+             )
+         return cls()
+
    def write_header(self):
        self.header.splice(
            f"""
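Note: the switch to `wrapper.PythonWrapperCodegen` plus the new `create` hook matches how recent Inductor versions instantiate wrapper codegens, so the class can still be registered for the backend's device in the usual out-of-tree way. A minimal sketch, where the device key `"extension_device"` and the scheduling class `ExtensionScheduling` are placeholders, not names from this commit:

```python
from torch._inductor.codegen.common import register_backend_for_device

# Illustrative registration only; the device key and scheduling class are assumed.
register_backend_for_device(
    "extension_device",       # device this out-of-tree backend compiles for
    ExtensionScheduling,      # backend scheduling class (assumed to exist)
    ExtensionWrapperCodegen,  # the wrapper codegen updated above
)
```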
@@ -107,6 +124,7 @@ def write_header(self):
            reinterpret_tensor = torch.ops.aten._reinterpret_tensor
            custom_async_compile = CustomAsyncCompile()
            os.environ["TORCHSIM_LAST_COMPILED_MODULE"] = __file__
+             print(f\'Wrapper Codegen Path = {{__file__}}\')
            """
        )
        self.header.splice(
@@ -154,7 +172,7 @@ def call(args):
        self.prefix.writeline(f"{lhs} = args")
        self.prefix.writeline("args.clear()")

-         self.codegen_inputs(self.prefix, V.graph.graph_inputs)
+         self.codegen_inputs()
        self.codegen_input_size_asserts()
        self.codegen_sram_plan_prefix()
@@ -174,10 +192,27 @@ def codegen_sram_plan_postfix(self, outputs):
                continue
            self.wrapper_call.writeline(f"sram_plan_postfix('{name}', {name})")

-     @dynamo_timed
+     def _generate_kernel_call_helper(
+         self,
+         kernel_name: str,
+         call_args,
+         *,
+         device=None,
+         triton=True,
+         arg_types=None,
+         raw_keys=None,
+         raw_args=None,
+         triton_meta=None,
+         graph_name="",
+         original_fxnode_name=None,
+     ):
+         device = device or V.graph.get_current_device_or_throw()
+         self.writeline(self.wrap_kernel_call(kernel_name, call_args))
+         return
+
    def generate(self, is_inference):
        result = IndentedBuffer()
-         result.splice(self.header)
+         # result.splice(self.header)

        with contextlib.ExitStack() as stack:
            stack.enter_context(self.wrapper_call.indent())
@@ -192,8 +227,13 @@ def generate(self, is_inference):

                if isinstance(line, wrapper.MemoryPlanningLine):
                    line.codegen(self.wrapper_call)
+                 elif isinstance(line, wrapper.KernelCallLine):
+                     self.wrapper_call.writeline(self.wrap_kernel_call(line.kernel_name, line.call_args))
                else:
-                     self.wrapper_call.writeline(line)
+                     if isinstance(line, wrapper.WrapperLine):
+                         line.codegen(self.wrapper_call)
+                     else:
+                         self.wrapper_call.writeline(line)
                # Add buffer plan hook for alloc
                if isinstance(line, memory_planning.AllocFromPoolLine) or isinstance(line, wrapper.AllocateLine):
                    self.wrapper_call.writeline(f"sram_plan_prefix('{line.node.get_name()}', {line.node.get_name()})")
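Both the `_generate_kernel_call_helper` override and the new `KernelCallLine` branch route kernel invocations through `wrap_kernel_call`, which in the base wrapper codegen renders a plain `name(arg, ...)` Python call. A standalone illustration of that output shape (kernel and buffer names are made up):

```python
# Mimics the "name(arg, ...)" line shape produced by wrap_kernel_call in the
# base PythonWrapperCodegen; illustration only, not code from this commit.
def render_kernel_call(kernel_name: str, call_args: list[str]) -> str:
    return f"{kernel_name}({', '.join(call_args)})"

print(render_kernel_call("extension_kernel_0", ["buf0", "arg0_1"]))
# extension_kernel_0(buf0, arg0_1)
```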
@@ -202,7 +242,9 @@ def generate(self, is_inference):
            self.mark_output_type()
            self.generate_return(output_refs)

-         self.append_precomputed_sizes_to_prefix()
+         # self.append_precomputed_sizes_to_prefix()  # FIXME: Need to replace append_precomputed_sizes_to_prefix()
+         result.splice(self.header)
+
        self.finalize_prefix()
        result.splice(self.prefix)

@@ -211,7 +253,10 @@ def generate(self, is_inference):

        self.generate_end(result)
        self.add_benchmark_harness(result)
-         return result.getvaluewithlinemap()
+         return (
+             result.getvaluewithlinemap(),
+             self.kernel_declarations.getvaluewithlinemap(),
+         )

    def memory_plan(self):
        self.lines = memory_planning.MemoryPlanner(self).plan(self.lines)
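`generate` now follows the newer Inductor contract of returning the wrapper code and the kernel declarations separately, so callers must unpack a pair instead of a single buffer. A hedged sketch of a consumer, assuming each element behaves like recent Inductor's `ValueWithLineMap` and exposes a `.value` string:

```python
def assemble_module_source(codegen: ExtensionWrapperCodegen, is_inference: bool = True) -> str:
    # Hypothetical helper: stitch the two-part return of generate() into one
    # module source; assumes .value holds the generated code string.
    wrapper_code, kernel_code = codegen.generate(is_inference)
    return kernel_code.value + "\n" + wrapper_code.value
```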
@@ -1494,16 +1539,16 @@ def get_cycle(choice):
        return optimal_src_code

    def codegen_nodes(self, nodes, kernel_name):
-         src_code = super().codegen_nodes(nodes, kernel_name)
+         src_code, meta_code = super().codegen_nodes(nodes, kernel_name)
        self._prepare_simulator_headers(src_code)
        if not extension_config.CONFIG_AUTOTUNE or extension_config.CONFIG_BACKENDSIM_SPIKE_ONLY:
-             return src_code
+             return src_code, meta_code
        else:
            optimal_src_code = self.autotune(nodes, kernel_name)
            if optimal_src_code:
-                 return optimal_src_code
+                 return optimal_src_code, meta_code
            else:
-                 return src_code
+                 return src_code, meta_code

    def _prepare_simulator_headers(self, src_code):
        write_path = extension_codecache.get_write_path(src_code)
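Since `codegen_nodes` now returns a `(src_code, meta_code)` pair, every call site has to unpack both values even if only the source is cached. A hedged sketch of such a caller, reusing the `get_write_path`/`write_atomic` utilities already used in this file (the `kernel_codegen` argument is illustrative):

```python
def cache_generated_kernel(kernel_codegen, nodes, kernel_name: str):
    # Unpack the new (src_code, meta_code) pair; only src_code is written to
    # the on-disk cache here, meta_code is handed back to the caller.
    src_code, meta_code = kernel_codegen.codegen_nodes(nodes, kernel_name)
    write_path = extension_codecache.get_write_path(src_code)
    write_atomic(write_path, src_code)
    return write_path, meta_code
```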