Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ if(WITH_CAMBRICON)
endif()

# If all other platforms are not enabled, CPU is enabled by default.
if(NOT WITH_NVIDIA AND NOT WITH_ILUVATAR AND NOT WITH_METAX)
if(NOT WITH_NVIDIA AND NOT WITH_ILUVATAR AND NOT WITH_METAX AND NOT WITH_CAMBRICON)
add_compile_definitions(WITH_CPU=1)
endif()

Expand Down
13 changes: 7 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,13 @@ make -j$(nproc)

For the `<OPTIONS>`:

| Option | Functionality | Default
|----------------------------------------|------------------------------------|:-:
| `-DWITH_CPU=[ON\|OFF]` | Compile the CPU implementation | n
| `-DWITH_NVIDIA=[ON\|OFF]` | Compile the NVIDIA implementation | n
| `-DWITH_METAX=[ON\|OFF]` | Compile the MetaX implementation | n
| `-DGENERATE_PYTHON_BINDINGS=[ON\|OFF]` | Generate Python bindings | n
| Option | Functionality | Default
|----------------------------------------|-------------------------------------|:-:
| `-DWITH_CPU=[ON\|OFF]` | Compile the CPU implementation | n
| `-DWITH_NVIDIA=[ON\|OFF]` | Compile the NVIDIA implementation | n
| `-DWITH_METAX=[ON\|OFF]` | Compile the MetaX implementation | n
| `-DWITH_CAMBRICON=[ON\|OFF]`           | Compile the Cambricon implementation | n
| `-DGENERATE_PYTHON_BINDINGS=[ON\|OFF]` | Generate Python bindings | n

*Note: If no accelerator options are provided, `WITH_CPU` is enabled by default.*

Expand Down
80 changes: 38 additions & 42 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,27 +1,19 @@
add_library(infiniops SHARED)

# Device-independent sources at the top of src/.
file(GLOB BASE_SRCS CONFIGURE_DEPENDS "*.cc")
target_sources(infiniops PRIVATE ${BASE_SRCS})

# Names of every enabled backend; consumed by the Python-binding
# generator at the bottom of this file.
set(DEVICE_LIST "")

if(WITH_CPU)
    set(CPU_PATTERNS
        "cpu/*.cc"
        "cpu/*.cpp"
    )

    file(GLOB_RECURSE CPU_SOURCES CONFIGURE_DEPENDS ${CPU_PATTERNS})
    list(APPEND CORE_SOURCES ${CPU_SOURCES})
    # BUG FIX: CPU_SOURCES were only collected into CORE_SOURCES, which
    # nothing in this file consumes, so the CPU kernels were never
    # compiled into the target. Add them directly, as every other
    # backend does. (The CORE_SOURCES append is kept in case an
    # out-of-view chunk reads it.)
    target_sources(infiniops PRIVATE ${CPU_SOURCES})

    target_compile_definitions(infiniops PUBLIC WITH_CPU=1)

    find_package(OpenMP REQUIRED)
    target_link_libraries(infiniops PRIVATE OpenMP::OpenMP_CXX)

    list(APPEND DEVICE_LIST "cpu")
endif()

if(WITH_NVIDIA)
set(NVIDIA_PATTERNS
"cuda/*.cc"
Expand All @@ -31,24 +23,18 @@ if(WITH_NVIDIA)
"nvidia/*.cpp"
"nvidia/*.cu"
)

file(GLOB_RECURSE NVIDIA_SOURCES CONFIGURE_DEPENDS ${NVIDIA_PATTERNS})

enable_language(CUDA)

target_compile_definitions(infiniops PUBLIC WITH_NVIDIA=1)
target_sources(infiniops PRIVATE ${NVIDIA_SOURCES})

find_package(CUDAToolkit REQUIRED)
target_link_libraries(infiniops PUBLIC CUDA::cudart CUDA::cublas CUDA::cuda_driver)

list(APPEND DEVICE_LIST "nvidia")
set_target_properties(infiniops PROPERTIES
CUDA_STANDARD 17
CUDA_STANDARD_REQUIRED ON
)
endif()

if(WITH_ILUVATAR)
set(ILUVATAR_PATTERNS
"cuda/*.cc"
Expand All @@ -58,99 +44,109 @@ if(WITH_ILUVATAR)
"iluvatar/*.cpp"
"iluvatar/*.cu"
)

file(GLOB_RECURSE ILUVATAR_SOURCES CONFIGURE_DEPENDS ${ILUVATAR_PATTERNS})

enable_language(CUDA)

target_compile_definitions(infiniops PUBLIC WITH_ILUVATAR=1)
target_sources(infiniops PRIVATE ${ILUVATAR_SOURCES})

find_package(CUDAToolkit REQUIRED)
target_link_libraries(infiniops PUBLIC CUDA::cudart CUDA::cublas CUDA::cuda_driver)

set_target_properties(infiniops PROPERTIES
CUDA_STANDARD 17
CUDA_STANDARD_REQUIRED ON
)

list(APPEND DEVICE_LIST "iluvatar")
endif()

if(WITH_METAX)
    set(METAX_PATTERNS
        "cuda/*.cc"
        "cuda/*.cpp"
        "metax/*.cc"
        "metax/*.maca"
    )

    file(GLOB_RECURSE METAX_SOURCES CONFIGURE_DEPENDS ${METAX_PATTERNS})

    # .maca sources are compiled by the host C++ toolchain (with the
    # `-x maca` option below) rather than a separate CMake language.
    set_source_files_properties(${METAX_SOURCES} PROPERTIES LANGUAGE CXX)

    # CONSISTENCY FIX: every other backend (WITH_CPU, WITH_NVIDIA,
    # WITH_ILUVATAR, WITH_CAMBRICON) exports its macro PUBLIC so that
    # consumers compiling against the public headers see it; this one
    # was PRIVATE.
    target_compile_definitions(infiniops PUBLIC WITH_METAX=1)
    # NOTE(review): PUBLIC propagates `-x maca` onto every consumer's
    # compile line as well — confirm that is intended and not PRIVATE.
    target_compile_options(infiniops PUBLIC "-x" "maca")
    target_sources(infiniops PRIVATE ${METAX_SOURCES})

    target_include_directories(infiniops PUBLIC "${MACA_PATH}/include")
    target_link_libraries(infiniops PUBLIC
        ${MACA_RUNTIME_LIB}
        ${MACA_DNN_LIB}
        ${MACA_BLAS_LIB}
    )

    list(APPEND DEVICE_LIST "metax")
endif()

if(WITH_CAMBRICON)
    # Exported once, PUBLIC, like the other backends. (A second,
    # redundant PRIVATE definition of the same macro further down in
    # this block was removed.)
    target_compile_definitions(infiniops PUBLIC WITH_CAMBRICON=1)

    file(GLOB_RECURSE CAMBRICON_MLU_SOURCES CONFIGURE_DEPENDS "cambricon/*/*.mlu")
    find_program(CNCC_COMPILER cncc HINTS "${NEUWARE_HOME}/bin" "$ENV{NEUWARE_HOME}/bin" /usr/local/neuware/bin)
    if(CNCC_COMPILER)
        message(STATUS "Found cncc: ${CNCC_COMPILER}")
        # `-c` removed from the option list: it was passed twice (here
        # and again explicitly in the COMMAND below).
        # NOTE(review): the MLU arch and the clang resource dir are
        # hardcoded; consider exposing them as cache variables.
        set(MLU_COMPILE_OPTS
            --bang-mlu-arch=mtp_592 -O3 -fPIC -Wall -Werror -std=c++17 -pthread
            -I${CMAKE_CURRENT_SOURCE_DIR} -I${NEUWARE_HOME}/include
            -idirafter /usr/local/neuware/lib/clang/11.1.0/include
        )
        # Compiles one .mlu kernel to an object file with cncc and
        # records the object in the CAMBRICON_OBJECTS directory property.
        function(compile_mlu_file src_file)
            get_filename_component(name ${src_file} NAME_WE)
            get_filename_component(path ${src_file} DIRECTORY)
            set(out_file "${CMAKE_CURRENT_BINARY_DIR}/${path}/${name}.o")
            file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${path}")
            # VERBATIM added so argument escaping is platform-independent.
            add_custom_command(OUTPUT ${out_file}
                COMMAND ${CNCC_COMPILER} ${MLU_COMPILE_OPTS} -c ${src_file} -o ${out_file}
                DEPENDS ${src_file}
                COMMENT "Building MLU kernel: ${src_file}"
                VERBATIM
            )
            set_property(DIRECTORY APPEND PROPERTY CAMBRICON_OBJECTS ${out_file})
        endfunction()
        foreach(src ${CAMBRICON_MLU_SOURCES})
            compile_mlu_file(${src})
        endforeach()
        get_directory_property(CAMBRICON_OBJECT_FILES CAMBRICON_OBJECTS)
        if(CAMBRICON_OBJECT_FILES)
            target_sources(infiniops PRIVATE ${CAMBRICON_OBJECT_FILES})
        endif()
    else()
        message(WARNING "cncc compiler not found. MLU kernels will not be compiled.")
    endif()
    target_include_directories(infiniops PUBLIC "${NEUWARE_HOME}/include")
    target_link_libraries(infiniops PUBLIC ${CAMBRICON_RUNTIME_LIB} ${CAMBRICON_CNNL_LIB} ${CAMBRICON_CNNL_EXTRA_LIB} ${CAMBRICON_PAPI_LIB})

    if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
        target_compile_options(infiniops PUBLIC
            "$<$<COMPILE_LANGUAGE:CXX>:SHELL:-idirafter /usr/local/neuware/lib/clang/11.1.0/include>"
        )
    endif()
    list(APPEND DEVICE_LIST "cambricon")
endif()

target_include_directories(infiniops PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

if(GENERATE_PYTHON_BINDINGS)
    # Locate the interpreter before running the generator so the script
    # runs with the same Python the bindings are built against; REQUIRED
    # gives a clear configure-time error instead of a late, confusing
    # pybind11_add_module failure.
    find_package(Python COMPONENTS Interpreter Development REQUIRED)
    find_package(pybind11 CONFIG REQUIRED)

    # FIX: use the interpreter found above rather than a bare `python`,
    # which may not exist on PATH (many distros ship only `python3`).
    execute_process(
        COMMAND ${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/scripts/generate_wrappers.py --devices ${DEVICE_LIST}
        WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
        RESULT_VARIABLE script_result
    )

    if(NOT script_result EQUAL 0)
        message(FATAL_ERROR "Generating wrappers - failed")
    else()
        message(STATUS "Generating wrappers - done")
    endif()

    # NOTE(review): the generator writes into the source tree
    # (${PROJECT_SOURCE_DIR}/generated); consider emitting into the
    # binary dir instead.
    set(PYBIND11_SOURCES "${PROJECT_SOURCE_DIR}/generated/bindings/ops.cc")

    # TODO: There might be a better solution.
    if(WITH_NVIDIA OR WITH_ILUVATAR)
        set_source_files_properties(${PYBIND11_SOURCES} PROPERTIES LANGUAGE CUDA)
    endif()

    if(PYBIND11_ENABLE_EXTRAS)
        pybind11_add_module(ops ${PYBIND11_SOURCES})
    else()
        pybind11_add_module(ops NO_EXTRAS ${PYBIND11_SOURCES})
    endif()

    target_include_directories(ops PRIVATE ${PROJECT_SOURCE_DIR})
    target_link_libraries(ops PRIVATE infiniops)

    # $ORIGIN lets the installed modules resolve each other from the
    # directory they are installed into.
    set_target_properties(infiniops PROPERTIES INSTALL_RPATH "$ORIGIN")
    set_target_properties(ops PROPERTIES INSTALL_RPATH "$ORIGIN")

    install(TARGETS infiniops ops DESTINATION .)

    file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/__init__.py" "")
    install(FILES "${CMAKE_CURRENT_BINARY_DIR}/__init__.py" DESTINATION .)
endif()
endif()
10 changes: 6 additions & 4 deletions src/base/rms_norm.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,17 @@ namespace infini::ops {
class RmsNorm : public Operator<RmsNorm> {
public:
RmsNorm(const Tensor input, const Tensor weight, float eps, Tensor out)
: eps_{eps},
: input_shape_{input.shape()},
out_shape_{out.shape()},
input_shape_{input.shape()},
out_strides_{out.strides()},
input_strides_{input.strides()},
out_strides_{out.strides()},
eps_{eps},
dim_{out.size(-1)},
ndim_{out.ndim()},
batch_size_{ndim_ == 2 ? out.size(-2) : out.size(-3)},
nhead_{ndim_ == 2 ? 1 : out.size(-2)} {}
nhead_{ndim_ == 2 ? 1 : out.size(-2)} {
assert(input.dtype() == out.dtype());
}

RmsNorm(const Tensor input, const Tensor weight, Tensor out)
: RmsNorm{input, weight, 1e-6f, out} {}
Expand Down
100 changes: 100 additions & 0 deletions src/cambricon/cast.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#ifndef INFINI_OPS_COMMON_CAMBRICON_CAST_H_
#define INFINI_OPS_COMMON_CAMBRICON_CAST_H_

#include <type_traits>
#include <utility>

#include "bang_bf16.h"
#include "bang_fp16.h"

#include "data_type.h"

namespace infini::ops {

namespace detail {

// Strips cv-qualifiers and references from T — the C++17 spelling of
// std::remove_cvref_t (which is C++20).
template <typename T>
using PureType = std::remove_cv_t<std::remove_reference_t<T>>;

// Widens any supported scalar to float: the reduced-precision types go
// through the vendor conversion intrinsics, everything else through a
// plain static_cast.
template <typename T>
__host__ __device__ constexpr float ToFloatHelper(T&& x) {
  using Decayed = PureType<T>;
  if constexpr (IsFP16<Decayed>) {
    return __half2float(x);
  } else if constexpr (IsBFloat16<Decayed>) {
    return __bfloat162float__(x);
  } else {
    return static_cast<float>(std::forward<T>(x));
  }
}

// Narrows a float back down to Dst, routing bf16/fp16 destinations
// through the vendor conversion intrinsics.
template <typename Dst>
__host__ __device__ constexpr Dst FromFloatHelper(float f) {
  using Target = PureType<Dst>;
  if constexpr (IsFP16<Target>) {
    return __float2half__(f);
  } else if constexpr (IsBFloat16<Target>) {
    return __float2bfloat16__(f);
  } else {
    return static_cast<Dst>(f);
  }
}

// Priority tags for overload resolution: an overload taking PriorityHigh
// is preferred over one taking PriorityLow when both are viable, because
// PriorityHigh is the more derived type (no conversion needed).
struct PriorityLow {};

struct PriorityHigh : PriorityLow {};

// Fallback: lowest priority. This always matches if nothing else does.
// Round-trips through float: widen the source, then narrow to Dst.
template <typename Dst, typename Src>
__host__ __device__ constexpr Dst HardwareCast(Src&& value, PriorityLow) {
  const float widened = ToFloatHelper(std::forward<Src>(value));
  return FromFloatHelper<Dst>(widened);
}

// Usage: `DEFINE_DIRECT_CAST(INTRINSIC, CONDITION)`.
// Generates a PriorityHigh overload of HardwareCast that forwards the value
// straight to INTRINSIC whenever CONDITION holds; SFINAE (the trailing
// enable_if_t) removes the overload otherwise, so resolution falls back to
// the float round-trip above. No comments may appear inside the macro body:
// a `//` before the line-continuation backslash would splice the comment
// across lines.
#define DEFINE_DIRECT_CAST(INTRINSIC, ...) \
template <typename Dst, typename Src> \
__host__ __device__ auto HardwareCast(Src x, PriorityHigh) \
->std::enable_if_t<(__VA_ARGS__), \
decltype(INTRINSIC(std::declval<Src>()))> { \
return INTRINSIC(x); \
}

// bf16 -> int, round toward zero.
DEFINE_DIRECT_CAST(
__bfloat162int_rz__,
std::is_same_v<PureType<Dst>, int>&& IsBFloat16<PureType<Src>>)
// bf16 -> short, round toward zero.
DEFINE_DIRECT_CAST(
__bfloat162short_rz__,
std::is_same_v<PureType<Dst>, short>&& IsBFloat16<PureType<Src>>)
// int -> bf16, round to nearest even.
DEFINE_DIRECT_CAST(
__int2bfloat16_rn__,
IsBFloat16<PureType<Dst>>&& std::is_same_v<PureType<Src>, int>)
// int -> fp16, round to nearest even.
DEFINE_DIRECT_CAST(__int2half_rn__,
IsFP16<PureType<Dst>>&& std::is_same_v<PureType<Src>, int>)
// double -> bf16. NOTE(review): the intrinsic is named __float2bfloat16__
// but the condition matches a *double* source, so the double is implicitly
// narrowed to float first — confirm this is intended.
DEFINE_DIRECT_CAST(
__float2bfloat16__,
IsBFloat16<PureType<Dst>>&& std::is_same_v<PureType<Src>, double>)
// double -> fp16 (same implicit double->float narrowing — confirm).
DEFINE_DIRECT_CAST(
__float2half__,
IsFP16<PureType<Dst>>&& std::is_same_v<PureType<Src>, double>)
// bf16 -> fp16, using __half itself as a functional cast. NOTE(review):
// verify that __half is constructible from the bf16 type.
DEFINE_DIRECT_CAST(__half, IsFP16<PureType<Dst>>&& IsBFloat16<PureType<Src>>)
#undef DEFINE_DIRECT_CAST

} // namespace detail

// Converts `x` to Dst. An identity cast forwards the value unchanged;
// everything else is dispatched to detail::HardwareCast, which prefers a
// direct vendor intrinsic and otherwise round-trips through float.
template <typename Dst, typename Src>
__host__ __device__ Dst Cast(Src&& x) {
  static_assert(!std::is_reference_v<Dst>,
                "`Cast` cannot return reference types");

  // CONSISTENCY: reuse detail::PureType instead of re-spelling
  // remove_cv_t<remove_reference_t<...>> that the alias already wraps.
  using PureSrc = detail::PureType<Src>;
  using PureDst = detail::PureType<Dst>;

  if constexpr (std::is_same_v<PureSrc, PureDst>) {
    return std::forward<Src>(x);
  } else {
    return detail::HardwareCast<PureDst>(std::forward<Src>(x),
                                         detail::PriorityHigh{});
  }
}

} // namespace infini::ops

#endif
Loading