Changes from all commits (30 commits)

a23984f  check grad before using ipex (#1358)  (jiqing-feng, Sep 19, 2024)
e8881be  Enable packaging for ROCm 6.2 (#1367)  (pnunna93, Sep 20, 2024)
0d3d977  Update for VS2022 17.11 compatibility with CUDA < 12.4 (#1341)  (matthewdouglas, Sep 9, 2024)
e72637c  Enable continuous releases for multi-backend-refactor branch  (matthewdouglas, Sep 26, 2024)
662dc60  Update release workflow  (matthewdouglas, Sep 26, 2024)
3227cdd  Publish continuous release for multi-backend  (matthewdouglas, Sep 26, 2024)
0a2b539  continuous release: revert wheel renaming due to install err  (Titus-von-Koeller, Sep 27, 2024)
8c5499e  Revert "continuous release: revert wheel renaming due to install err"  (Titus-von-Koeller, Sep 27, 2024)
02d5b42  add dynamic tag-based versioning + git hash for dev vers  (Titus-von-Koeller, Sep 27, 2024)
6927dcc  docs: update w/ changes from `main`  (Titus-von-Koeller, Sep 27, 2024)
8dcd971  get tags for dynamic versioning  (Titus-von-Koeller, Sep 27, 2024)
09ac7ec  fine-tune continuous release params  (Titus-von-Koeller, Sep 30, 2024)
cc56a30  reduce the pkg size + build times for the preview release  (Titus-von-Koeller, Sep 30, 2024)
5225ebe  refine docs for multi-backend alpha release (#1380)  (Titus-von-Koeller, Sep 30, 2024)
e6cc109  docs: remove 2 obsolete lines  (Titus-von-Koeller, Oct 1, 2024)
cd3cb68  Remove depth option in installation steps (#1395)  (pnunna93, Oct 16, 2024)
cd73601  Fix issue that no valid semantic version tag found when installing bi…  (ji-huazhong, Nov 20, 2024)
b2ac423  Enable XPU and optimize cpu/xpu op (#1418)  (jiqing-feng, Nov 29, 2024)
9315692  fix cpu nf4 (#1432)  (jiqing-feng, Dec 2, 2024)
9948333  Add Ascend NPU support for nf4 quant (#1422)  (ji-huazhong, Dec 6, 2024)
7e6f865  fix device check (#1453)  (jiqing-feng, Dec 17, 2024)
f6025bc  Enable double quant on Intel CPU and XPU (#1472)  (jiqing-feng, Jan 22, 2025)
307fbd5  Enable dequant+matmul 8bit path for Intel CPU and XPU (#1484)  (jiqing-feng, Jan 28, 2025)
a0a95fd  add device index (#1489)  (faaany, Jan 28, 2025)
0dbebb9  IFU-master-2025-02-07  (Feb 7, 2025)
3ac9d38  IFU-master-2025-02-07  (Lzy17, Feb 7, 2025)
057bf4e  fix _functions  (Lzy17, Feb 13, 2025)
250eb5e  Update installation.mdx  (Lzy17, Feb 21, 2025)
a4e6852  Update installation.mdx to the correct version  (Lzy17, Feb 21, 2025)
6dedda4  Fix installation.mdx  (Lzy17, Feb 21, 2025)
52 changes: 50 additions & 2 deletions .github/workflows/python-package.yml
@@ -58,6 +58,7 @@ jobs:
# This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64)
##
build-shared-libs-cuda:
if: github.ref_name != 'multi-backend-refactor'
strategy:
matrix:
os: [ubuntu-latest, windows-latest]
@@ -148,7 +149,7 @@ jobs:
build-wheels:
needs:
- build-shared-libs
- build-shared-libs-cuda
# - build-shared-libs-cuda  # disabled to reduce the pkg size + build times for the preview release
- build-shared-libs-rocm
strategy:
matrix:
@@ -166,6 +167,13 @@ jobs:
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1 # shallow clone
- name: Fetch tags for dynamic versioning in setup.py
run: |
git fetch --depth=1 origin --tags
echo "Available Git tags:"
git tag -n
- name: Download build artifact
uses: actions/download-artifact@v4
with:
@@ -183,7 +191,8 @@ jobs:
python-version: ${{ matrix.python-version }}
cache: pip
- run: pip install build wheel
- run: python -m build .
# For now we need the two invocations below instead of the prior `python -m build .`, which didn't allow us to access git tags
- run: python -m build --sdist && python -m build --wheel
- name: Determine and Set Platform Tag, then Tag Wheel
shell: bash
run: |
@@ -197,6 +206,45 @@
path: dist/bitsandbytes-*.whl
retention-days: 7
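
The shallow tag fetch and the switch from `python -m build .` to separate `--sdist`/`--wheel` invocations above exist so that setup.py can derive the package version from git tags at build time. The setup.py side is not part of this diff, so the following is only a rough sketch, under the assumption that the dev version is computed from `git describe`, of what tag-based versioning plus a git hash can look like:

```python
# Hypothetical sketch only; the actual setup.py logic is not shown in this diff.
import subprocess

def infer_version() -> str:
    # e.g. "0.44.0-12-gabc1234" -> "0.44.0.dev12+abc1234"
    desc = subprocess.check_output(
        ["git", "describe", "--tags", "--long"], text=True
    ).strip()
    tag, n_commits, git_hash = desc.rsplit("-", 2)
    if n_commits == "0":
        return tag  # building exactly on a release tag
    return f"{tag}.dev{n_commits}+{git_hash.lstrip('g')}"

if __name__ == "__main__":
    print(infer_version())
```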

upload-pre-release-wheels:
name: Create release and upload artifacts
runs-on: ubuntu-latest
if: github.ref_name == 'multi-backend-refactor'
permissions:
contents: write
needs:
- build-wheels
steps:
- name: Download and rename artifacts
uses: actions/download-artifact@v4
with:
path: tmp/
pattern: "bdist_wheel_*"
merge-multiple: true
- name: Inspect tmp directory after downloading artifacts
run: ls -alFR tmp/
- name: Move and rename wheel files with pattern replacement
run: |
mkdir -p wheels/
find tmp/ -type f -name '*.whl' -print0 | while IFS= read -r -d '' wheel; do
wheel_filename=$(basename "$wheel")
# Remove the git hash, e.g. `+1234567`, for a stable download link on the multi-backend pre-release
cleaned_filename=$(echo "$wheel_filename" | sed -E 's/\+[0-9a-f]{7}-/-/g')
mv "$wheel" "wheels/$cleaned_filename"
done
- name: Inspect wheels directory after renaming files
run: ls -alFR wheels/
- name: Create release and upload artifacts
uses: softprops/action-gh-release@v2.0.8
with:
files: wheels/*.whl
prerelease: true
name: Multi-Backend Preview
tag_name: continuous-release_multi-backend-refactor
make_latest: false
draft: false
target_commitish: ${{ github.sha }}

audit-wheels:
needs: build-wheels
runs-on: ubuntu-latest
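
For clarity, the `sed` expression in the wheel-renaming step above strips a seven-character git hash such as `+1234567` from the wheel filename so that the pre-release download link stays stable across builds. The same transformation expressed with Python's `re` module, on a made-up filename for illustration:

```python
import re

# Hypothetical wheel filename with the version-local git hash appended by dev versioning.
name = "bitsandbytes-0.44.1.dev0+1234567-py3-none-manylinux_2_24_x86_64.whl"

# Mirrors the workflow's sed expression: s/\+[0-9a-f]{7}-/-/g
print(re.sub(r"\+[0-9a-f]{7}-", "-", name))
# -> bitsandbytes-0.44.1.dev0-py3-none-manylinux_2_24_x86_64.whl
```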
2 changes: 2 additions & 0 deletions .gitignore
@@ -151,6 +151,8 @@ dmypy.json
# vim
*.swp

# BNB-specific stuff
dependencies
cuda_build
output/
bitsandbytes/_version.py
54 changes: 50 additions & 4 deletions CMakeLists.txt
@@ -3,7 +3,7 @@
# For GCC: `cmake -B build . && cmake --build build`
# For MSVC: `cmake -B build . && cmake --build build --config Release`
# You can also use the following options and variables
# - COMPUTE_BACKEND: Set to `cpu`, `cuda`, `hip` or `mps` to select the backend
# - COMPUTE_BACKEND: Set to `cpu`, `cuda`, `hip`, `mps` or `npu` to select the backend
# - NO_CUBLASLT: Default OFF, will skip building/linking CUBLASLT support
# - CUDA_VERSION: The expected CUDA version, for sanity checking. The actual version
# is whatever CMake finds on your path.
@@ -29,11 +29,12 @@ set(CUDA_FILES csrc/ops.cu csrc/kernels.cu)
set(HIP_FILES csrc/ops.hip csrc/kernels.hip)
set(MPS_FILES csrc/mps_ops.mm)
set(METAL_FILES csrc/mps_kernels.metal)
set(NPU_FILES csrc/npu_ops.cpp)
# C++ sources are always included
list(APPEND SRC_FILES ${CPP_FILES})

set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps)")
set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps)
set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps, npu)")
set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps npu)
option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)

if(APPLE)
@@ -69,6 +70,11 @@ elseif(${COMPUTE_BACKEND} STREQUAL "mps")
set(BUILD_CUDA OFF)
set(BUILD_HIP OFF)
set(BUILD_MPS ON)
elseif(${COMPUTE_BACKEND} STREQUAL "npu")
set(BUILD_CUDA OFF)
set(BUILD_HIP OFF)
set(BUILD_MPS OFF)
set(BUILD_NPU ON)
else()
set(BUILD_CUDA OFF)
set(BUILD_HIP OFF)
@@ -82,6 +88,11 @@ if(BUILD_CUDA)
# This needs to be added *before* we try to enable the CUDA language so CMake's compiler check passes.
if(MSVC AND MSVC_VERSION VERSION_GREATER_EQUAL 1940)
string(APPEND CMAKE_CUDA_FLAGS " --allow-unsupported-compiler")

# This is needed to build with VS2022 17.11+ and CUDA < 12.4.
if (MSVC_VERSION VERSION_GREATER_EQUAL 1941)
string(APPEND CMAKE_CUDA_FLAGS " -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH")
endif()
endif()

enable_language(CUDA) # This will fail if CUDA is not found
@@ -227,6 +238,33 @@ elseif(BUILD_MPS)
COMMENT "Compiling Metal kernels"
VERBATIM)
add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib")
elseif(BUILD_NPU)
list(APPEND SRC_FILES ${NPU_FILES})

set(SOC_VERSION "Ascend910B4" CACHE STRING "system on chip type")
set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_HOME_PATH} CACHE
STRING "ASCEND CAN package installation directory"
)

# ${KERNEL_FILES} are used to compile the library; add files written in Ascend C to ${KERNEL_FILES}.
# See cmake/npu.cmake (ascendc_library) and cmake/cpu.cmake (add_library).
# file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/csrc/npu_kernels.cpp)
file(GLOB KERNEL_FILES csrc/npu_kernels.cpp)

if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
else()
message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the CANN package is installed")
endif()
include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)

# ascendc_library is used to add the kernel files and generate the ascendc library
ascendc_library(ascendc_kernels_npu STATIC ${KERNEL_FILES})

string(APPEND BNB_OUTPUT_NAME "_npu")
add_compile_definitions(BUILD_NPU)
else()
string(APPEND BNB_OUTPUT_NAME "_cpu")
set(GPU_SOURCES)
@@ -244,7 +282,11 @@ endif()

set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)
add_library(bitsandbytes SHARED ${SRC_FILES})
target_compile_features(bitsandbytes PUBLIC cxx_std_14)
if(BUILD_NPU)
target_compile_features(bitsandbytes PUBLIC cxx_std_17)
else()
target_compile_features(bitsandbytes PUBLIC cxx_std_14)
endif()
target_include_directories(bitsandbytes PUBLIC csrc include)


@@ -301,6 +343,10 @@ if(BUILD_MPS)
add_dependencies(bitsandbytes metallib)
target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph")
endif()
if(BUILD_NPU)
target_compile_options(bitsandbytes PRIVATE -O2 -std=c++17)
target_link_libraries(bitsandbytes PRIVATE $<BUILD_INTERFACE:host_intf_pub> ascendc_kernels_npu)
endif()

if(WIN32)
set_target_properties(bitsandbytes PROPERTIES PREFIX "lib")
3 changes: 3 additions & 0 deletions _typos.toml
@@ -3,12 +3,15 @@
[default]
extend-ignore-re = [
"@Ther-nul", # valid Github user
"CANN", # CANN (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for Ascend NPU
]

[default.extend-identifiers]

[type.py.extend-words]
"BA" = "BA" # used as a commented-out variable in tests
"cann" = "cann" # cann (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for Ascend NPU


[type.cuda.extend-words]
"subtile" = "subtile"
14 changes: 10 additions & 4 deletions bitsandbytes/__init__.py
@@ -3,6 +3,9 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Import the dynamically generated version from _version.py (see setup.py)
from ._version import __version__ # isort: skip # type: ignore

import torch

from . import research, utils
@@ -14,15 +17,15 @@
matmul_cublas,
mm_cublas,
)
from .backends import register_backend
from .backends import backends, register_backend
from .backends.cpu import CPUBackend
from .backends.npu import NPUBackend
from .cextension import lib
from .nn import modules

features = {"multi_backend"}
supported_torch_devices = {
"cuda", # includes ROCm
"npu", # Ascend NPU
"xpu", # Intel GPU
"cpu",
}
@@ -61,6 +64,11 @@
if hasattr(torch, "npu") and torch.npu.is_available():
register_backend("npu", NPUBackend())


# import modules only after the backends have been decided
if backends:
from .nn import modules

# TODO: Other potential backends:
# XLA - Google TPU / PJRT runtime
# HPU - Habana / Intel Gaudi
@@ -73,5 +81,3 @@
"optim.optimizer.Optimizer8bit": False,
"optim.optimizer.MockArgs": False,
}

__version__ = "0.43.3.dev"
55 changes: 49 additions & 6 deletions bitsandbytes/autograd/_functions.py
@@ -221,7 +221,7 @@ def backward(ctx, grad_output):

def supports_igemmlt(device: torch.device) -> bool:
"""check if this device supports the optimized int8 kernel"""
if device == torch.device("cpu"):
if device == torch.device("cpu") or torch.device("xpu"):
return True
if torch.version.hip:
return False if BNB_HIP_VERSION < 601 else True
@@ -463,7 +465,9 @@ def backward(ctx, grad_output):
if len(grad_output.shape) == 3:
grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous()

Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16))
Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = None, None, None, None, None
if req_gradB or (req_gradA and state.CBt is not None):
Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16))
if req_gradB:
CxAt, SAt = F.transform(CAt, formatB, transpose=True)
C32grad, Sgrad = F.transform(Cgradt, "col32", transpose=True)
@@ -517,7 +519,12 @@ def forward(ctx, A, B, out=None, bias=None, quant_state: Optional[F.QuantState]

# 1. Dequantize
# 2. MatmulnN
output = torch.nn.functional.linear(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t(), bias)
if A.device.type == "npu":
output = torch.matmul(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t())
if bias is not None:
output += bias
else:
output = torch.nn.functional.linear(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t(), bias)

# 3. Save state
ctx.state = quant_state
@@ -548,11 +555,37 @@ def backward(ctx, grad_output):
# not supported by PyTorch. TODO: create work-around
# if req_gradB: grad_B = torch.matmul(grad_output.t(), A)
if req_gradA:
grad_A = torch.matmul(grad_output, F.dequantize_4bit(B, ctx.state).to(grad_output.dtype).t())
if grad_output.device.type == "npu":
grad_A = torch.matmul(grad_output, F.dequantize_4bit(B, ctx.state).to(grad_output.dtype))
else:
grad_A = torch.matmul(grad_output, F.dequantize_4bit(B, ctx.state).to(grad_output.dtype).t())

return grad_A, grad_B, None, grad_bias, None


class MatMul8bitFp(torch.autograd.Function):
# For Intel CPU and XPU, double quant has many unsafe operations which break fine-tuning.
# We therefore use dequant + matmul to run fine-tuning for now.

@staticmethod
def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
CB = B.data.to(A.dtype).mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0)).t()
output = torch.matmul(A, CB).to(A.dtype)
ctx.state = state
ctx.dtype_A = A.dtype
ctx.grad_shape = A.shape
return output

@staticmethod
def backward(ctx, grad_output):
state = ctx.state
B = state.CxB if state.CxB is not None else state.CB
CB = B.to(ctx.dtype_A).mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0))
grad_A = torch.matmul(grad_output, CB).view(ctx.grad_shape).to(ctx.dtype_A)

return grad_A, None, None, None, None


def matmul(
A: torch.Tensor,
B: torch.Tensor,
@@ -564,6 +597,8 @@ def matmul(
state = state or MatmulLtState()
if threshold > 0.0:
state.threshold = threshold
if A.device.type in ("cpu", "xpu") and state.is_training:
return MatMul8bitFp.apply(A, B, out, bias, state)
return MatMul8bitLt.apply(A, B, out, bias, state)
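
The `matmul` dispatch above routes CPU/XPU training through `MatMul8bitFp`, which avoids the int8 double-quant path entirely: it rescales the stored int8 weights back to floating point and runs a plain matmul. A minimal sketch of that dequant step with made-up shapes (just the arithmetic, not the library API):

```python
import torch

A = torch.randn(4, 8)                                      # activations [batch, in_features]
CB = torch.randint(-127, 128, (16, 8), dtype=torch.int8)   # int8 weights [out_features, in_features]
SCB = torch.rand(16) * 10                                   # per-row absmax scales [out_features]

# Row-wise dequantization followed by a regular fp matmul -- the core of MatMul8bitFp.forward.
W = CB.to(A.dtype) * SCB.unsqueeze(1).mul(1.0 / 127.0)
out = A @ W.t()
print(out.shape)  # torch.Size([4, 16])
```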


@@ -575,8 +610,16 @@ def matmul_4bit(
bias=None,
):
assert quant_state is not None
if (A.numel() == A.shape[-1] or A.device.type == "cpu") and A.requires_grad == False:
# CPU and XPU backends do not require A to be a vector
if A.device.type in ("cpu", "xpu") and A.requires_grad == False:
if getattr(quant_state, "ipex", False):
B = B.t() if len(B.shape) == 2 else B
out = F.gemv_4bit(A, B, out, state=quant_state)
if bias is not None:
out += bias
return out
else:
return MatMul4Bit.apply(A, B, out, bias, quant_state)
elif A.numel() == A.shape[-1] and A.requires_grad == False and A.device.type != "npu":
if A.shape[-1] % quant_state.blocksize != 0:
warn(
f"Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}",
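
The `matmul_4bit` CPU/XPU path above builds on the 4-bit quantize/dequantize primitives that the multi-backend branch implements for these devices. As a closing illustration, a minimal NF4 round trip on CPU, assuming the CPU backend is registered and provides `quantize_4bit`/`dequantize_4bit`:

```python
import torch
import bitsandbytes.functional as F

W = torch.randn(64, 64)
qW, quant_state = F.quantize_4bit(W, quant_type="nf4", blocksize=64)
W_hat = F.dequantize_4bit(qW, quant_state)
print((W - W_hat).abs().mean())  # small NF4 quantization error
```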