94 commits
99448b4 save (zasdfgbnm, Nov 19, 2025)
044ec83 save (zasdfgbnm, Nov 19, 2025)
3ca6b58 save (zasdfgbnm, Nov 19, 2025)
6ebb547 save (zasdfgbnm, Nov 19, 2025)
a4ed7f1 save (zasdfgbnm, Nov 19, 2025)
a76eb2b save (zasdfgbnm, Nov 19, 2025)
64f5d85 save (zasdfgbnm, Nov 21, 2025)
582ac5f save (zasdfgbnm, Nov 21, 2025)
97bc4ea save (zasdfgbnm, Nov 21, 2025)
a4c177b save (zasdfgbnm, Nov 21, 2025)
083704f save (zasdfgbnm, Nov 21, 2025)
65bebbf save (zasdfgbnm, Nov 21, 2025)
cbd1577 save (zasdfgbnm, Nov 21, 2025)
9e54170 save (zasdfgbnm, Nov 21, 2025)
d330bbf save (zasdfgbnm, Nov 21, 2025)
30beaa3 save (zasdfgbnm, Nov 21, 2025)
cc929cc inline (zasdfgbnm, Nov 24, 2025)
97b48ce standalone (zasdfgbnm, Nov 24, 2025)
bab5941 save (zasdfgbnm, Nov 24, 2025)
c7729c8 save (zasdfgbnm, Nov 24, 2025)
b0a6608 save (zasdfgbnm, Nov 24, 2025)
08080b1 save (zasdfgbnm, Nov 24, 2025)
c536ce7 save (zasdfgbnm, Nov 25, 2025)
55bdfd2 save (zasdfgbnm, Nov 25, 2025)
adfd74b save (zasdfgbnm, Nov 25, 2025)
0665554 no pytest (zasdfgbnm, Nov 25, 2025)
4fc0389 save (zasdfgbnm, Nov 25, 2025)
a6112b7 save (zasdfgbnm, Nov 25, 2025)
018ad95 dce (zasdfgbnm, Nov 25, 2025)
8e8e723 save (zasdfgbnm, Nov 25, 2025)
6e6166e save (zasdfgbnm, Nov 25, 2025)
bf9fbd0 save (zasdfgbnm, Nov 25, 2025)
ccac376 save (zasdfgbnm, Nov 25, 2025)
6b26347 bigger cleanup (zasdfgbnm, Nov 25, 2025)
84c121f save (zasdfgbnm, Nov 25, 2025)
0d68b21 debug (zasdfgbnm, Nov 25, 2025)
1a55f50 save (zasdfgbnm, Nov 25, 2025)
80b5dcc save (zasdfgbnm, Nov 25, 2025)
65a6db1 save (zasdfgbnm, Nov 25, 2025)
3383017 save (zasdfgbnm, Nov 25, 2025)
36e2174 save (zasdfgbnm, Nov 25, 2025)
550fd99 save (zasdfgbnm, Nov 25, 2025)
de6c24f save (zasdfgbnm, Nov 25, 2025)
502550f save (zasdfgbnm, Nov 25, 2025)
a151fb9 save (zasdfgbnm, Nov 25, 2025)
dc4d17a save (zasdfgbnm, Nov 25, 2025)
e4a64fd save (zasdfgbnm, Nov 25, 2025)
7fd6adb save (zasdfgbnm, Nov 25, 2025)
a947ab0 save (zasdfgbnm, Nov 25, 2025)
47170f4 save (zasdfgbnm, Nov 25, 2025)
7a5583b save (zasdfgbnm, Nov 25, 2025)
e82eac3 save (zasdfgbnm, Nov 25, 2025)
92452f3 save (zasdfgbnm, Nov 25, 2025)
2285344 save (zasdfgbnm, Nov 25, 2025)
cf75099 save (zasdfgbnm, Nov 25, 2025)
2d57e7c save (zasdfgbnm, Nov 25, 2025)
2e8633f save (zasdfgbnm, Nov 25, 2025)
4a6a465 save (zasdfgbnm, Nov 25, 2025)
b910d6f save (zasdfgbnm, Nov 25, 2025)
08e4bb4 try separate test (zasdfgbnm, Nov 25, 2025)
cf11fcf save (zasdfgbnm, Nov 25, 2025)
95ede63 try (zasdfgbnm, Nov 25, 2025)
25ea85c try (zasdfgbnm, Nov 25, 2025)
97157c6 save (zasdfgbnm, Nov 25, 2025)
74e4b4d save (zasdfgbnm, Nov 25, 2025)
c83f9f5 try inline setup_class (zasdfgbnm, Nov 25, 2025)
580e49a save (zasdfgbnm, Nov 25, 2025)
def7beb save (zasdfgbnm, Nov 25, 2025)
b7b6cbd save (zasdfgbnm, Nov 25, 2025)
950ba2e save (zasdfgbnm, Nov 25, 2025)
5e9a4d3 save (zasdfgbnm, Nov 25, 2025)
8b08285 save (zasdfgbnm, Nov 25, 2025)
1766f61 save (zasdfgbnm, Nov 25, 2025)
a7d498e save (zasdfgbnm, Nov 25, 2025)
7de7f1f save (zasdfgbnm, Nov 25, 2025)
2bf2dae save (zasdfgbnm, Nov 25, 2025)
49b7318 save (zasdfgbnm, Nov 25, 2025)
c9133d4 save (zasdfgbnm, Nov 25, 2025)
7fa9603 save (zasdfgbnm, Nov 25, 2025)
12a1caf save (zasdfgbnm, Nov 25, 2025)
3219be7 save (zasdfgbnm, Nov 25, 2025)
ce2761e save (zasdfgbnm, Nov 25, 2025)
265c5d2 save (zasdfgbnm, Nov 25, 2025)
8d4e2b0 debug print (zasdfgbnm, Dec 1, 2025)
c145c6a more debug print (zasdfgbnm, Dec 1, 2025)
e146d9f save (zasdfgbnm, Dec 1, 2025)
f78e3b4 save (zasdfgbnm, Dec 1, 2025)
1906b6f more (zasdfgbnm, Dec 1, 2025)
1baf233 save (zasdfgbnm, Dec 1, 2025)
bc90466 save (zasdfgbnm, Dec 1, 2025)
36bbe34 python repro (zasdfgbnm, Dec 1, 2025)
daa85dc save (zasdfgbnm, Dec 1, 2025)
f27691a save (zasdfgbnm, Dec 2, 2025)
359b952 save (zasdfgbnm, Dec 2, 2025)
119 changes: 44 additions & 75 deletions README.md
@@ -1,75 +1,44 @@
<!--
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
-->

# Fuser

A Fusion Code Generator for NVIDIA GPUs (commonly known as "nvFuser")

## Installation

We publish nightly wheel packages on https://pypi.nvidia.com, and wheels built against the stable torch release on https://pypi.org and https://pypi.nvidia.com.
**Wheels are published for Python versions: _3.10_, _3.12_**.

Note that nvfuser built against a stable torch release is not compatible with nightly pytorch wheels, so make sure you pick the version that matches your environment.


### Nightly nvfuser pip wheel

You can install a nightly nvfuser pip package built against the torch nightly code base with
`pip install --pre nvfuser-cu128 --extra-index-url https://pypi.nvidia.com`

Because we build against the nightly torch wheel and nightly wheels come with no compatibility guarantees,
we explicitly mark the nightly torch wheel as an optional dependency.
You can choose to install the torch wheel along with the nvfuser package,
e.g. `pip install --pre "nvfuser-cu128[torch]" --extra-index-url https://pypi.nvidia.com`.
Note that this may uninstall your local pytorch installation and replace it with a compatible nightly pytorch.

### Nvfuser pip wheel against pytorch stable release

The nvfuser pip wheel built against stable torch releases is published on both https://pypi.org and https://pypi.nvidia.com.
Pick the cuda toolkit version that matches your torch installation, e.g. `pip install nvfuser-cu128-torch27`.

Older nvfuser builds against older pytorch versions, e.g. `nvfuser-cuXXY-torchZW`,
are available at [PyPI](https://pypi.org/search/?q=nvfuser).
We build and publish against the latest stable pytorch on https://pypi.org on the 1st and 15th of every month, and
whenever major changes land.

We always recommend using the latest nvfuser build with the latest cuda and pytorch versions.

PyPI: [https://pypi.org/project/nvfuser/](https://pypi.org/search/?q=nvfuser)
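Once installed, a quick smoke test confirms the wheel imports and runs. Below is a minimal sketch using the python frontend's `FusionDefinition`; the API names follow recent nvfuser releases, and a CUDA device is assumed to be available:

```python
# Minimal smoke test for the nvfuser python frontend.
# Assumes a CUDA device; API names follow recent nvfuser releases.
import torch
from nvfuser import FusionDefinition, DataType

with FusionDefinition() as fd:
    t0 = fd.define_tensor(shape=[-1, -1], dtype=DataType.Float)
    t1 = fd.ops.relu(t0)
    fd.add_output(t1)

x = torch.randn(2, 3, device="cuda")
(out,) = fd.execute([x])
print(out)
```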

## Developer

Docs: https://github.com/NVIDIA/Fuser/wiki

### Install From Source:
```bash
git clone https://github.com/NVIDIA/Fuser.git
cd Fuser
pip install -r python/requirements.txt
pip install --no-build-isolation -e python -v
```

The deprecated flow `[MAX_JOBS] python setup.py develop [args]` is replaced by the `pip install` command above; setting `MAX_JOBS=<n>` to cap compile parallelism is assumed to still be honored by the pip-driven build, as it was by the old flow.

Supported compilers:

**GCC:**

We support all "supported releases" of gcc as listed on [the official site](https://gcc.gnu.org/).
As of 3/2/2025, these are:

- gcc 12.4
- gcc 13.3
- gcc 14.2

**Clang:**

- clang 16+

Supported C++ standard:

- C++20
self.test_addcmul()
self.test_alias_output_to_input()
self.test_all_dim_var_mean()
self.test_allocation_domain_concretization()
self.test_allocation_domain_index_select()
self.test_arithmetic_ops()
self.test_basic()
self.test_basic_fp16()
self.test_bcast_squeeze_replace_aliased_output()
self.test_broadcast_and_stride_order()
self.test_broadcast_in_dim_with_dynamic_shapes()
self.test_broadcast_mixing()
self.test_cast_double_to_half()
self.test_cat()
self.test_cat_symbolic()
self.test_complex_constants()
self.test_complex_rsqrt()
self.test_compute_contiguity()
self.test_compute_tensor_descriptor()
self.test_constant_nans()
self.test_cuda_code_and_scheduled_fusion_ir_strings()
self.test_cumsum()
self.test_cumsum_int()
self.test_debug_output()
self.test_def_op_in_schedule()
self.test_deterministic_random()
self.test_dynamic_reshape()
self.test_empty_reshape()
self.test_enable_disable_options()
self.test_execute_with_tuple_and_list()
self.test_expand()
self.test_expand_to_zero()
self.test_expanded_bcast_tensor()
self.test_expanded_reduction()
self.test_explicit_broadcast_input()
self.test_fix_2549()
self.test_from_pytorch_fails_on_cpu_tensor()
self.test_func_definition()
self.test_fusion_definition_error_cache()
self.test_fusion_information()
self.test_fusion_profiler()
self.test_fusion_profiler_user_schedule()
self.test_fusion_profiler_with_noncodegen_kernels()
self.test_gather
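The block of `self.test_*()` calls above invokes each test method directly rather than relying on pytest collection, in line with the PR's `standalone`, `no pytest`, and `try inline setup_class` commits. A minimal sketch of that pattern follows; the class and method names here are hypothetical, not from the diff:

```python
# Hypothetical sketch of invoking test methods inline, without pytest.
# TestExample, setup_class, and test_basic are illustrative names only.
import torch


class TestExample:
    @classmethod
    def setup_class(cls):
        # Stand-in for real per-class setup (e.g. building fusions).
        cls.device = "cuda" if torch.cuda.is_available() else "cpu"

    def test_basic(self):
        x = torch.ones(4, device=self.device)
        assert x.sum().item() == 4.0

    def run_all(self):
        # Inline calls replace pytest's test collection.
        self.test_basic()


if __name__ == "__main__":
    TestExample.setup_class()
    TestExample().run_all()
```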
184 changes: 184 additions & 0 deletions csrc/ir/nodes.cpp
@@ -4876,8 +4876,30 @@ std::string LinearOp::toInlineString(int indent_size) const {
std::vector<PolymorphicValue> LinearOp::evaluate(
const ExpressionEvaluator& ee,
const std::vector<PolymorphicValue>& inputs) const {
std::cout << "[DEBUG] LinearOp::evaluate - ENTRY" << std::endl;
std::cout.flush();

std::cout << "[DEBUG] LinearOp::evaluate - STEP 1: Getting input tensor (inputs.size=" << inputs.size() << ")" << std::endl;
std::cout.flush();
const auto in = inputs.at(0).as<at::Tensor>();
std::cout << "[DEBUG] LinearOp::evaluate - STEP 1a: Input tensor obtained, shape=[";
for (int64_t i = 0; i < in.dim(); i++) {
if (i > 0) std::cout << ", ";
std::cout << in.size(i);
}
std::cout << "], device=" << in.device() << std::endl;
std::cout.flush();

std::cout << "[DEBUG] LinearOp::evaluate - STEP 2: Getting weight tensor" << std::endl;
std::cout.flush();
auto weight = inputs.at(1).as<at::Tensor>();
std::cout << "[DEBUG] LinearOp::evaluate - STEP 2a: Weight tensor obtained, shape=[";
for (int64_t i = 0; i < weight.dim(); i++) {
if (i > 0) std::cout << ", ";
std::cout << weight.size(i);
}
std::cout << "], device=" << weight.device() << std::endl;
std::cout.flush();

auto squeeze_device_dims = [](at::Tensor& t,
int64_t num_device_dims) -> void {
@@ -4896,28 +4918,190 @@ std::vector<PolymorphicValue> LinearOp::evaluate(

// The squeezes and unsqueezes are currently required to support a sharded
// linear layer. Remove them after #2563.
std::cout << "[DEBUG] LinearOp::evaluate - STEP 3: Calculating num_device_dims (weight.dim=" << weight.dim() << ")" << std::endl;
std::cout.flush();
auto num_device_dims = weight.dim() - 2;
std::cout << "[DEBUG] LinearOp::evaluate - STEP 3a: num_device_dims=" << num_device_dims << std::endl;
std::cout.flush();

std::cout << "[DEBUG] LinearOp::evaluate - STEP 4: Squeezing device dims from weight" << std::endl;
std::cout.flush();
squeeze_device_dims(weight, num_device_dims);
std::cout << "[DEBUG] LinearOp::evaluate - STEP 4a: Weight squeezed, new shape=[";
for (int64_t i = 0; i < weight.dim(); i++) {
if (i > 0) std::cout << ", ";
std::cout << weight.size(i);
}
std::cout << "]" << std::endl;
std::cout.flush();

std::cout << "[DEBUG] LinearOp::evaluate - STEP 5: Checking hasBias (hasBias=" << hasBias() << ")" << std::endl;
std::cout.flush();
at::Tensor out_tensor;
if (hasBias()) {
std::cout << "[DEBUG] LinearOp::evaluate - STEP 5a: Getting bias tensor" << std::endl;
std::cout.flush();
auto bias = inputs.at(2).as<at::Tensor>();
std::cout << "[DEBUG] LinearOp::evaluate - STEP 5b: Bias tensor obtained, shape=[";
for (int64_t i = 0; i < bias.dim(); i++) {
if (i > 0) std::cout << ", ";
std::cout << bias.size(i);
}
std::cout << "], device=" << bias.device() << std::endl;
std::cout.flush();

std::cout << "[DEBUG] LinearOp::evaluate - STEP 5c: Squeezing device dims from bias" << std::endl;
std::cout.flush();
squeeze_device_dims(bias, num_device_dims);
std::cout << "[DEBUG] LinearOp::evaluate - STEP 5d: Bias squeezed, new shape=[";
for (int64_t i = 0; i < bias.dim(); i++) {
if (i > 0) std::cout << ", ";
std::cout << bias.size(i);
}
std::cout << "]" << std::endl;
std::cout.flush();

std::cout << "[DEBUG] LinearOp::evaluate - STEP 5e: Calling at::linear with bias" << std::endl;
std::cout << "[DEBUG] LinearOp::evaluate - INPUT METADATA:" << std::endl;
std::cout << " in.sizes: [";
for (int64_t i = 0; i < in.dim(); i++) {
if (i > 0) std::cout << ", ";
std::cout << in.size(i);
}
std::cout << "]" << std::endl;
std::cout << " in.strides: [";
for (int64_t i = 0; i < in.dim(); i++) {
if (i > 0) std::cout << ", ";
std::cout << in.stride(i);
}
std::cout << "]" << std::endl;
std::cout << " in.dtype: " << in.dtype() << std::endl;
std::cout << " in.device: " << in.device() << std::endl;
std::cout << " in.is_contiguous: " << in.is_contiguous() << std::endl;
std::cout << " in.numel: " << in.numel() << std::endl;
std::cout << "[DEBUG] LinearOp::evaluate - WEIGHT METADATA:" << std::endl;
std::cout << " weight.sizes: [";
for (int64_t i = 0; i < weight.dim(); i++) {
if (i > 0) std::cout << ", ";
std::cout << weight.size(i);
}
std::cout << "]" << std::endl;
std::cout << " weight.strides: [";
for (int64_t i = 0; i < weight.dim(); i++) {
if (i > 0) std::cout << ", ";
std::cout << weight.stride(i);
}
std::cout << "]" << std::endl;
std::cout << " weight.dtype: " << weight.dtype() << std::endl;
std::cout << " weight.device: " << weight.device() << std::endl;
std::cout << " weight.is_contiguous: " << weight.is_contiguous() << std::endl;
std::cout << " weight.numel: " << weight.numel() << std::endl;
std::cout << "[DEBUG] LinearOp::evaluate - BIAS METADATA:" << std::endl;
std::cout << " bias.sizes: [";
for (int64_t i = 0; i < bias.dim(); i++) {
if (i > 0) std::cout << ", ";
std::cout << bias.size(i);
}
std::cout << "]" << std::endl;
std::cout << " bias.strides: [";
for (int64_t i = 0; i < bias.dim(); i++) {
if (i > 0) std::cout << ", ";
std::cout << bias.stride(i);
}
std::cout << "]" << std::endl;
std::cout << " bias.dtype: " << bias.dtype() << std::endl;
std::cout << " bias.device: " << bias.device() << std::endl;
std::cout << " bias.is_contiguous: " << bias.is_contiguous() << std::endl;
std::cout << " bias.numel: " << bias.numel() << std::endl;
std::cout.flush();
out_tensor = at::linear(in, weight, bias);
std::cout << "[DEBUG] LinearOp::evaluate - STEP 5f: at::linear completed" << std::endl;
std::cout.flush();
} else {
std::cout << "[DEBUG] LinearOp::evaluate - STEP 5g: Calling at::linear without bias" << std::endl;
std::cout << "[DEBUG] LinearOp::evaluate - INPUT METADATA:" << std::endl;
std::cout << " in.sizes: [";
for (int64_t i = 0; i < in.dim(); i++) {
if (i > 0) std::cout << ", ";
std::cout << in.size(i);
}
std::cout << "]" << std::endl;
std::cout << " in.strides: [";
for (int64_t i = 0; i < in.dim(); i++) {
if (i > 0) std::cout << ", ";
std::cout << in.stride(i);
}
std::cout << "]" << std::endl;
std::cout << " in.dtype: " << in.dtype() << std::endl;
std::cout << " in.device: " << in.device() << std::endl;
std::cout << " in.is_contiguous: " << in.is_contiguous() << std::endl;
std::cout << " in.numel: " << in.numel() << std::endl;
std::cout << "[DEBUG] LinearOp::evaluate - WEIGHT METADATA:" << std::endl;
std::cout << " weight.sizes: [";
for (int64_t i = 0; i < weight.dim(); i++) {
if (i > 0) std::cout << ", ";
std::cout << weight.size(i);
}
std::cout << "]" << std::endl;
std::cout << " weight.strides: [";
for (int64_t i = 0; i < weight.dim(); i++) {
if (i > 0) std::cout << ", ";
std::cout << weight.stride(i);
}
std::cout << "]" << std::endl;
std::cout << " weight.dtype: " << weight.dtype() << std::endl;
std::cout << " weight.device: " << weight.device() << std::endl;
std::cout << " weight.is_contiguous: " << weight.is_contiguous() << std::endl;
std::cout << " weight.numel: " << weight.numel() << std::endl;
std::cout.flush();
out_tensor = at::linear(in, weight);
std::cout << "[DEBUG] LinearOp::evaluate - STEP 5h: at::linear completed" << std::endl;
std::cout.flush();
}

std::cout << "[DEBUG] LinearOp::evaluate - STEP 6: at::linear result shape=[";
for (int64_t i = 0; i < out_tensor.dim(); i++) {
if (i > 0) std::cout << ", ";
std::cout << out_tensor.size(i);
}
std::cout << "], device=" << out_tensor.device() << std::endl;
std::cout.flush();

std::cout << "[DEBUG] LinearOp::evaluate - STEP 7: Unsqueezing output (num_device_dims=" << num_device_dims << ")" << std::endl;
std::cout.flush();
for ([[maybe_unused]] auto _ : arange(num_device_dims)) {
out_tensor = out_tensor.unsqueeze(0);
}
std::cout << "[DEBUG] LinearOp::evaluate - STEP 7a: Unsqueezed output shape=[";
for (int64_t i = 0; i < out_tensor.dim(); i++) {
if (i > 0) std::cout << ", ";
std::cout << out_tensor.size(i);
}
std::cout << "]" << std::endl;
std::cout.flush();

// Handle rFactor DIDs similar to MatmulOp::evaluate.
std::cout << "[DEBUG] LinearOp::evaluate - STEP 8: Checking rFactor device dimension index" << std::endl;
std::cout.flush();
if (const auto rfactor_did_idx = getRFactorDeviceDimensionIndex(out());
rfactor_did_idx != -1) {
std::cout << "[DEBUG] LinearOp::evaluate - STEP 8a: rFactor DID index=" << rfactor_did_idx << ", unsqueezing" << std::endl;
std::cout.flush();
out_tensor = out_tensor.unsqueeze(rfactor_did_idx);
std::cout << "[DEBUG] LinearOp::evaluate - STEP 8b: Final output shape=[";
for (int64_t i = 0; i < out_tensor.dim(); i++) {
if (i > 0) std::cout << ", ";
std::cout << out_tensor.size(i);
}
std::cout << "]" << std::endl;
std::cout.flush();
} else {
std::cout << "[DEBUG] LinearOp::evaluate - STEP 8c: No rFactor DID, skipping unsqueeze" << std::endl;
std::cout.flush();
}

std::cout << "[DEBUG] LinearOp::evaluate - STEP 9: Returning result" << std::endl;
std::cout.flush();
return {out_tensor};
}
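For reference, the squeeze → linear → unsqueeze sequence instrumented above is easy to reproduce in plain pytorch when bisecting whether a failure comes from `at::linear` itself. A minimal sketch with made-up extents (illustrative only, not part of the diff):

```python
# Standalone repro sketch of the sharded-linear pattern traced above:
# squeeze leading device dims (extent 1) off the weight, run linear,
# then unsqueeze the output back. Extents are made up for illustration.
import torch
import torch.nn.functional as F

x = torch.randn(4, 8)
weight = torch.randn(1, 16, 8)  # [DID, out_features, in_features]

num_device_dims = weight.dim() - 2
for _ in range(num_device_dims):
    weight = weight.squeeze(0)

out = F.linear(x, weight)  # dispatches to the same at::linear as above

for _ in range(num_device_dims):
    out = out.unsqueeze(0)

print(out.shape)  # torch.Size([1, 4, 16])
```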
