From a23984fed5f87f24348d2e8f10e8792853d5eaed Mon Sep 17 00:00:00 2001
From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com>
Date: Fri, 20 Sep 2024 07:35:40 +0800
Subject: [PATCH 01/29] check grad before using ipex (#1358)
---
bitsandbytes/nn/modules.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index ad424a6f4..32854413f 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -471,6 +471,7 @@ def forward(self, x: torch.Tensor):
and not hasattr(self.weight.quant_state, "op_context")
and self.weight.quant_state.shape[1] % self.weight.quant_state.blocksize == 0
and self.weight.quant_state.quant_type == "nf4"
+ and x.requires_grad == False
):
enable_ipex_fusion(self.weight, self.weight.quant_state)
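The condition added here skips IPEX weight fusion whenever the activation tracks gradients, keeping the fused path to inference-style forward calls. A minimal sketch of the combined guard, assuming a `Linear4bit`-style module whose `weight.quant_state` exposes `shape`, `blocksize` and `quant_type` (the helper name below is hypothetical, not part of bitsandbytes):

```python
import torch

def should_enable_ipex_fusion(weight, x: torch.Tensor) -> bool:
    # Hypothetical helper mirroring the guard in Linear4bit.forward above.
    qs = weight.quant_state
    return (
        not hasattr(qs, "op_context")         # weight not already repacked by IPEX
        and qs.shape[1] % qs.blocksize == 0   # input dimension must be block-aligned
        and qs.quant_type == "nf4"            # fused path only covers NF4
        and not x.requires_grad               # skip fusion when the activation tracks gradients
    )
```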
From e8881bef17a4666ac5fee65a73bf337cdc8ca547 Mon Sep 17 00:00:00 2001
From: pnunna93 <104791500+pnunna93@users.noreply.github.com>
Date: Fri, 20 Sep 2024 15:54:58 -0500
Subject: [PATCH 02/29] Enable packaging for ROCm 6.2 (#1367)
* Enable 6.2 build
* Update documentation for 6.2.0 pip install
---
.github/workflows/python-package.yml | 2 +-
docs/source/installation.mdx | 2 +-
tests/test_functional.py | 1 +
3 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index d2da82501..21c4c1895 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -107,7 +107,7 @@ jobs:
os: [ubuntu-latest]
arch: [x86_64]
rocm_version:
- ["6.1.2"]
+ ["6.1.2", "6.2"]
runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
steps:
- uses: actions/checkout@v4
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 60419b38a..146fb0ddd 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -167,7 +167,7 @@ apt-get update && apt-get install -y git && cd home
pip install torch --index-url https://download.pytorch.org/whl/rocm6.1/
# Install bitsandbytes from PyPI
-# (This is supported on Ubuntu 22.04, Python 3.10, ROCm 6.1.0/6.1.1/6.1.2 and gpu arch - gfx90a, gfx942, gfx1100
+# (This is supported on Ubuntu 22.04, Python 3.10, ROCm 6.1.0/6.1.1/6.1.2/6.2.0 and gpu arch - gfx90a, gfx942, gfx1100
# Please install from source if your configuration doesn't match with these)
pip install bitsandbytes
diff --git a/tests/test_functional.py b/tests/test_functional.py
index a9d926b89..35187db78 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -2303,6 +2303,7 @@ def test_gemv_4bit(dtype, storage_type, quant_storage, double_quant, kind):
assert maxratio < 1.02 and maxratio > 0.98
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
@pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"])
@pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
From 0d3d977c8f9fab7193345a4dc8f2e19c9bb35db3 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 9 Sep 2024 14:31:38 -0400
Subject: [PATCH 03/29] Update for VS2022 17.11 compatibility with CUDA < 12.4
(#1341)
* Update for VS2022 17.11 compatibility with CUDA < 12.4
* Try again
---
CMakeLists.txt | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index eac72fe52..315e0ff1b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -82,6 +82,11 @@ if(BUILD_CUDA)
# This needs to be added *before* we try to enable the CUDA language so CMake's compiler check passes.
if(MSVC AND MSVC_VERSION VERSION_GREATER_EQUAL 1940)
string(APPEND CMAKE_CUDA_FLAGS " --allow-unsupported-compiler")
+
+ # This is needed to build with VS2022 17.11+ and CUDA < 12.4.
+ if (MSVC_VERSION VERSION_GREATER_EQUAL 1941)
+ string(APPEND CMAKE_CUDA_FLAGS " -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH")
+ endif()
endif()
enable_language(CUDA) # This will fail if CUDA is not found
From e72637c99cd314a0b840615754fb4e433875b550 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 26 Sep 2024 09:45:42 -0400
Subject: [PATCH 04/29] Enable continuous releases for multi-backend-refactor
branch
---
.github/workflows/python-package.yml | 50 ++++++++++++++++++++++++++++
1 file changed, 50 insertions(+)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 21c4c1895..3aeeef9ba 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -197,6 +197,56 @@ jobs:
path: dist/bitsandbytes-*.whl
retention-days: 7
+ upload-pre-release-wheels:
+ name: Create release and upload artifacts
+ runs-on: ubuntu-latest
+ if: github.ref_name == 'multi-backend-refactor'
+ permissions:
+ contents: write
+ needs:
+ - build-wheels
+ steps:
+ - name: Download artifacts to tmp directory
+ uses: actions/download-artifact@v4
+ with:
+ path: tmp/
+ pattern: "bdist_wheel_*"
+ merge-multiple: true
+ - name: Inspect tmp directory after downloading artifacts
+ run: ls -alFR tmp/
+ - name: Move and rename wheel files
+ run: |
+ mkdir -p wheels/
+ find tmp/ -type f -name '*.whl' -print0 | while IFS= read -r -d '' wheel; do
+ wheel_filename=$(basename "$wheel")
+ if [[ $wheel_filename == *linux*x86_64* ]]; then
+ mv "$wheel" wheels/bnb-linux-x86_64.whl
+ elif [[ $wheel_filename == *linux*aarch64* ]]; then
+ mv "$wheel" wheels/bnb-linux-aarch64.whl
+ elif [[ $wheel_filename == *macosx*x86_64* ]]; then
+ mv "$wheel" wheels/bnb-macos-x86_64.whl
+ elif [[ $wheel_filename == *macosx*arm64* ]]; then
+ mv "$wheel" wheels/bnb-macos-arm64.whl
+ elif [[ $wheel_filename == *win*amd64* ]]; then
+ mv "$wheel" wheels/bnb-windows-x86_64.whl
+ else
+ echo "Unknown wheel format: $wheel_filename"
+ exit 1
+ fi
+ done
+ - name: Inspect wheels directory after renaming files
+ run: ls -alFR wheels/
+ - name: Create release and upload artifacts
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ GITHUB_CONTINUOUS_RELEASE_NAME: Multi-Backend Preview
+ GITHUB_CONTINUOUS_RELEASE_TYPE: prerelease
+ GITHUB_CONTINUOUS_RELEASE_TAG: continuous-release_preview
+ run: |
+ wget -q https://github.com/TheAssassin/pyuploadtool/releases/download/continuous/pyuploadtool-x86_64.AppImage
+ chmod +x pyuploadtool-x86_64.AppImage
+ ./pyuploadtool-x86_64.AppImage --appimage-extract-and-run wheels/*.whl
+
audit-wheels:
needs: build-wheels
runs-on: ubuntu-latest
From 662dc6057ad95207fe27fdd3925dd5c4094a8488 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 26 Sep 2024 13:01:34 -0400
Subject: [PATCH 05/29] Update release workflow
---
.github/workflows/python-package.yml | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 3aeeef9ba..77316967d 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -237,15 +237,15 @@ jobs:
- name: Inspect wheels directory after renaming files
run: ls -alFR wheels/
- name: Create release and upload artifacts
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- GITHUB_CONTINUOUS_RELEASE_NAME: Multi-Backend Preview
- GITHUB_CONTINUOUS_RELEASE_TYPE: prerelease
- GITHUB_CONTINUOUS_RELEASE_TAG: continuous-release_preview
- run: |
- wget -q https://github.com/TheAssassin/pyuploadtool/releases/download/continuous/pyuploadtool-x86_64.AppImage
- chmod +x pyuploadtool-x86_64.AppImage
- ./pyuploadtool-x86_64.AppImage --appimage-extract-and-run wheels/*.whl
+ uses: softprops/action-gh-release@v2.0.8
+ with:
+ files: wheels/*.whl
+ prerelease: true
+ name: Multi-Backend Preview
+ tag_name: continuous-release-preview
+ make_latest: false
+ draft: true
+ target_commitish: ${{ github.ref_name }}
audit-wheels:
needs: build-wheels
From 3227cdd366770c1e7b40eff3bf43dbbe012b6a9e Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 26 Sep 2024 13:20:00 -0400
Subject: [PATCH 06/29] Publish continuous release for multi-backend
---
.github/workflows/python-package.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 77316967d..37e52be6c 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -244,7 +244,7 @@ jobs:
name: Multi-Backend Preview
tag_name: continuous-release-preview
make_latest: false
- draft: true
+ draft: false
target_commitish: ${{ github.ref_name }}
audit-wheels:
From 0a2b5392ff079645fdc9ff887f80d327f9e874f7 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 27 Sep 2024 15:09:59 +0000
Subject: [PATCH 07/29] continuous release: revert wheel renaming due to
install err
---
.github/workflows/python-package.yml | 32 +++++-----------------------
1 file changed, 5 insertions(+), 27 deletions(-)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 37e52be6c..42d3d0957 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -206,40 +206,18 @@ jobs:
needs:
- build-wheels
steps:
- - name: Download artifacts to tmp directory
+ - name: Download artifacts
uses: actions/download-artifact@v4
with:
- path: tmp/
+ path: artifacts/
pattern: "bdist_wheel_*"
merge-multiple: true
- - name: Inspect tmp directory after downloading artifacts
- run: ls -alFR tmp/
- - name: Move and rename wheel files
- run: |
- mkdir -p wheels/
- find tmp/ -type f -name '*.whl' -print0 | while IFS= read -r -d '' wheel; do
- wheel_filename=$(basename "$wheel")
- if [[ $wheel_filename == *linux*x86_64* ]]; then
- mv "$wheel" wheels/bnb-linux-x86_64.whl
- elif [[ $wheel_filename == *linux*aarch64* ]]; then
- mv "$wheel" wheels/bnb-linux-aarch64.whl
- elif [[ $wheel_filename == *macosx*x86_64* ]]; then
- mv "$wheel" wheels/bnb-macos-x86_64.whl
- elif [[ $wheel_filename == *macosx*arm64* ]]; then
- mv "$wheel" wheels/bnb-macos-arm64.whl
- elif [[ $wheel_filename == *win*amd64* ]]; then
- mv "$wheel" wheels/bnb-windows-x86_64.whl
- else
- echo "Unknown wheel format: $wheel_filename"
- exit 1
- fi
- done
- - name: Inspect wheels directory after renaming files
- run: ls -alFR wheels/
+ - name: Inspect artifacts directory after downloading
+ run: ls -alFR artifacts/
- name: Create release and upload artifacts
uses: softprops/action-gh-release@v2.0.8
with:
- files: wheels/*.whl
+ files: artifacts/**/*.whl
prerelease: true
name: Multi-Backend Preview
tag_name: continuous-release-preview
From 8c5499e7498112fbdf172d2cba0d92a505ecef44 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 27 Sep 2024 17:38:12 +0000
Subject: [PATCH 08/29] Revert "continuous release: revert wheel renaming due
to install err"
This reverts commit 0a2b5392ff079645fdc9ff887f80d327f9e874f7.
---
.github/workflows/python-package.yml | 32 +++++++++++++++++++++++-----
1 file changed, 27 insertions(+), 5 deletions(-)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 42d3d0957..37e52be6c 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -206,18 +206,40 @@ jobs:
needs:
- build-wheels
steps:
- - name: Download artifacts
+ - name: Download artifacts to tmp directory
uses: actions/download-artifact@v4
with:
- path: artifacts/
+ path: tmp/
pattern: "bdist_wheel_*"
merge-multiple: true
- - name: Inspect artifacts directory after downloading
- run: ls -alFR artifacts/
+ - name: Inspect tmp directory after downloading artifacts
+ run: ls -alFR tmp/
+ - name: Move and rename wheel files
+ run: |
+ mkdir -p wheels/
+ find tmp/ -type f -name '*.whl' -print0 | while IFS= read -r -d '' wheel; do
+ wheel_filename=$(basename "$wheel")
+ if [[ $wheel_filename == *linux*x86_64* ]]; then
+ mv "$wheel" wheels/bnb-linux-x86_64.whl
+ elif [[ $wheel_filename == *linux*aarch64* ]]; then
+ mv "$wheel" wheels/bnb-linux-aarch64.whl
+ elif [[ $wheel_filename == *macosx*x86_64* ]]; then
+ mv "$wheel" wheels/bnb-macos-x86_64.whl
+ elif [[ $wheel_filename == *macosx*arm64* ]]; then
+ mv "$wheel" wheels/bnb-macos-arm64.whl
+ elif [[ $wheel_filename == *win*amd64* ]]; then
+ mv "$wheel" wheels/bnb-windows-x86_64.whl
+ else
+ echo "Unknown wheel format: $wheel_filename"
+ exit 1
+ fi
+ done
+ - name: Inspect wheels directory after renaming files
+ run: ls -alFR wheels/
- name: Create release and upload artifacts
uses: softprops/action-gh-release@v2.0.8
with:
- files: artifacts/**/*.whl
+ files: wheels/*.whl
prerelease: true
name: Multi-Backend Preview
tag_name: continuous-release-preview
From 02d5b423a56908e22edfe3a044de251de13dd231 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 27 Sep 2024 19:08:29 +0000
Subject: [PATCH 09/29] add dynamic tag-based versioning + git hash for dev
vers
---
.github/workflows/python-package.yml | 21 +++++-------------
.gitignore | 2 ++
bitsandbytes/__init__.py | 5 +++--
setup.py | 32 +++++++++++++++++++++++++++-
4 files changed, 41 insertions(+), 19 deletions(-)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 37e52be6c..f655df4f9 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -206,7 +206,7 @@ jobs:
needs:
- build-wheels
steps:
- - name: Download artifacts to tmp directory
+ - name: Download and rename artifacts
uses: actions/download-artifact@v4
with:
path: tmp/
@@ -214,25 +214,14 @@ jobs:
merge-multiple: true
- name: Inspect tmp directory after downloading artifacts
run: ls -alFR tmp/
- - name: Move and rename wheel files
+ - name: Move and rename wheel files with pattern replacement
run: |
mkdir -p wheels/
find tmp/ -type f -name '*.whl' -print0 | while IFS= read -r -d '' wheel; do
wheel_filename=$(basename "$wheel")
- if [[ $wheel_filename == *linux*x86_64* ]]; then
- mv "$wheel" wheels/bnb-linux-x86_64.whl
- elif [[ $wheel_filename == *linux*aarch64* ]]; then
- mv "$wheel" wheels/bnb-linux-aarch64.whl
- elif [[ $wheel_filename == *macosx*x86_64* ]]; then
- mv "$wheel" wheels/bnb-macos-x86_64.whl
- elif [[ $wheel_filename == *macosx*arm64* ]]; then
- mv "$wheel" wheels/bnb-macos-arm64.whl
- elif [[ $wheel_filename == *win*amd64* ]]; then
- mv "$wheel" wheels/bnb-windows-x86_64.whl
- else
- echo "Unknown wheel format: $wheel_filename"
- exit 1
- fi
+ # Remove the git hash, e.g. `+1234567`, for a stable download link on the multi-backend pre-release
+ cleaned_filename=$(echo "$wheel_filename" | sed -E 's/\+[0-9a-f]{7}-/-/g')
+ mv "$wheel" "wheels/$cleaned_filename"
done
- name: Inspect wheels directory after renaming files
run: ls -alFR wheels/
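The `sed` expression above strips the short git hash (the local-version segment such as `+1234567`) from each wheel filename so the pre-release download links stay stable across commits. A hedged Python equivalent of the same transformation, applied to a made-up filename:

```python
import re

# Hypothetical wheel name; only the `+<7-hex-char hash>-` segment is dropped.
name = "bitsandbytes-0.44.1.dev+1a2b3c4-py3-none-manylinux_2_24_x86_64.whl"
cleaned = re.sub(r"\+[0-9a-f]{7}-", "-", name)
assert cleaned == "bitsandbytes-0.44.1.dev-py3-none-manylinux_2_24_x86_64.whl"
```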
diff --git a/.gitignore b/.gitignore
index 22f5a6cd6..cd1b797bb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -151,6 +151,8 @@ dmypy.json
# vim
*.swp
+# BNB-specific stuff
dependencies
cuda_build
output/
+bitsandbytes/_version.py
diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 1e638eb79..25ec8a79a 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -3,6 +3,9 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
+# Import the dynamically generated version from _version.py (see setup.py)
+from ._version import __version__ # isort: skip # type: ignore
+
import torch
from . import research, utils
@@ -73,5 +76,3 @@
"optim.optimizer.Optimizer8bit": False,
"optim.optimizer.MockArgs": False,
}
-
-__version__ = "0.43.3.dev"
diff --git a/setup.py b/setup.py
index 18de0fe5b..2b1c1aff3 100644
--- a/setup.py
+++ b/setup.py
@@ -4,6 +4,7 @@
# LICENSE file in the root directory of this source tree.
import glob
import os
+import subprocess
from setuptools import find_packages, setup
from setuptools.dist import Distribution
@@ -13,6 +14,35 @@
print("libs:", libs)
+def get_git_commit_hash():
+ return subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]).decode("utf-8").strip()
+
+
+def is_git_tagged_commit():
+ tags = subprocess.check_output(["git", "tag", "--points-at", "HEAD"]).decode("utf-8").strip()
+ return bool(tags)
+
+
+def get_latest_semver_tag():
+ tags = subprocess.check_output(["git", "tag"], text=True).splitlines()
+ semver_tags = [tag for tag in tags if tag.count(".") == 2 and all(part.isdigit() for part in tag.split("."))]
+ if not semver_tags:
+ raise ValueError("No valid semantic version tags found")
+ return sorted(semver_tags, key=lambda s: list(map(int, s.split("."))))[-1]
+
+
+def write_version_file(version, filepath="bitsandbytes/_version.py"):
+ with open(filepath, "w") as f:
+ f.write(f'__version__ = "{version}"\n')
+
+
+def get_version_and_write_to_file():
+ latest_semver_tag = get_latest_semver_tag()
+ version = latest_semver_tag if is_git_tagged_commit() else f"{latest_semver_tag}.dev+{get_git_commit_hash()}"
+ write_version_file(version)
+ return version
+
+
def read(fname):
return open(os.path.join(os.path.dirname(__file__), fname)).read()
@@ -25,7 +55,7 @@ def has_ext_modules(self):
setup(
name="bitsandbytes",
- version="0.43.3.dev",
+ version=get_version_and_write_to_file(),
author="Tim Dettmers",
author_email="dettmers@cs.washington.edu",
description="k-bit optimizers and matrix multiplication routines.",
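The net effect of the new helpers: on a tagged commit the package version is the tag itself, otherwise it is the latest tag plus a `.dev+<short hash>` suffix, and the result is written to `bitsandbytes/_version.py` so `bitsandbytes.__version__` can import it. A small sketch of the composition as a pure function (no git calls, values supplied by the caller for illustration):

```python
def compose_version(latest_tag: str, is_tagged: bool, short_hash: str) -> str:
    # Mirrors get_version_and_write_to_file, minus the git lookups and the file write.
    return latest_tag if is_tagged else f"{latest_tag}.dev+{short_hash}"

assert compose_version("0.44.0", True, "1a2b3c4") == "0.44.0"
assert compose_version("0.44.0", False, "1a2b3c4") == "0.44.0.dev+1a2b3c4"
```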
From 6927dcc493562cdec804ffc833627275686b3904 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 27 Sep 2024 19:50:51 +0000
Subject: [PATCH 10/29] docs: update w/ changes from `main`
---
docs/source/contributing.mdx | 5 +++--
docs/source/installation.mdx | 2 +-
docs/source/non_cuda_backends.mdx | 16 +++++++++++++++-
3 files changed, 19 insertions(+), 4 deletions(-)
diff --git a/docs/source/contributing.mdx b/docs/source/contributing.mdx
index 4fe6b7541..5da42961e 100644
--- a/docs/source/contributing.mdx
+++ b/docs/source/contributing.mdx
@@ -5,8 +5,9 @@
### Setup pre-commit hooks
- Install pre-commit hooks with `pip install pre-commit`.
-- Run `pre-commit autoupdate` once to configure the hooks.
-- Re-run `pre-commit autoupdate` every time a new hook got added.
+- Run `pre-commit install` once to install the hooks, so they will be run on every commit.
+- If the hooks introduce changes, they'll be visible with `git diff`. Review them and `git add` them if everything is fine, then re-run the commit; it should pass now.
+- If you want to trigger the hooks manually, run `pre-commit run --all-files`
Now all the pre-commit hooks will be automatically run when you try to commit and if they introduce some changes, you need to re-add the changed files before being able to commit and push.
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 146fb0ddd..2f82c199b 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -137,7 +137,7 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-11.7
## Multi-backend[[multi-backend]]
> [!TIP]
-> This functionality is currently in preview and therefore not yet production-ready!
+> This functionality is currently in preview and therefore not yet production-ready! Please reference [this guide](./non_cuda_backends) for more in-depth information about the different backends and their current status.
Please follow these steps to install bitsandbytes with device-specific backend support other than CUDA:
diff --git a/docs/source/non_cuda_backends.mdx b/docs/source/non_cuda_backends.mdx
index fca586534..fc7c6ac27 100644
--- a/docs/source/non_cuda_backends.mdx
+++ b/docs/source/non_cuda_backends.mdx
@@ -24,4 +24,18 @@ Thank you for your support!
### Intel
-### AMD
+The following performance data was collected on an Intel 4th Gen Xeon (SPR) platform. The tables show the speed-up and memory usage of different data types for [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf).
+
+#### Inference (CPU)
+
+| Data Type | BF16 | INT8 | NF4 | FP4 |
+|---|---|---|---|---|
+| Speed-Up (vs BF16) | 1.0x | 0.6x | 2.3x | 0.03x |
+| Memory (GB) | 13.1 | 7.6 | 5.0 | 4.6 |
+
+#### Fine-Tuning (CPU)
+
+| Data Type | AMP BF16 | INT8 | NF4 | FP4 |
+|---|---|---|---|---|
+| Speed-Up (vs AMP BF16) | 1.0x | 0.38x | 0.07x | 0.07x |
+| Memory (GB) | 40 | 9 | 6.6 | 6.6 |
From 8dcd971cc11ab3449eea01419ec1676d5d5e53c8 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 27 Sep 2024 20:24:03 +0000
Subject: [PATCH 11/29] get tags for dynamic versioning
---
.github/workflows/python-package.yml | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index f655df4f9..9cd9ceb78 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -166,6 +166,13 @@ jobs:
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
+ with:
+ fetch-depth: 1 # shallow clone
+ - name: Fetch tags for dynamic versioning in setup.py
+ run: |
+ git fetch --depth=1 origin --tags
+ echo "Available Git tags:"
+ git tag -n
- name: Download build artifact
uses: actions/download-artifact@v4
with:
@@ -183,7 +190,8 @@ jobs:
python-version: ${{ matrix.python-version }}
cache: pip
- run: pip install build wheel
- - run: python -m build .
+ # For now we need to do the below instead of the prior `python -m build .`, which didn't allow us to access git tags
+ - run: python -m build --sdist && python -m build --wheel
- name: Determine and Set Platform Tag, then Tag Wheel
shell: bash
run: |
From 09ac7ec34f556d74356167ed4214d9e1f3f98bad Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Mon, 30 Sep 2024 18:34:53 +0000
Subject: [PATCH 12/29] fine-tune continuous release params
---
.github/workflows/python-package.yml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 9cd9ceb78..f96dd995e 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -239,10 +239,10 @@ jobs:
files: wheels/*.whl
prerelease: true
name: Multi-Backend Preview
- tag_name: continuous-release-preview
+ tag_name: continuous-release_multi-backend-refactor
make_latest: false
draft: false
- target_commitish: ${{ github.ref_name }}
+ target_commitish: ${{ github.sha }}
audit-wheels:
needs: build-wheels
From cc56a30e7d54e42328f0a995106828372acaebfe Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Mon, 30 Sep 2024 23:10:12 +0000
Subject: [PATCH 13/29] reduce the pkg size + build times for the preview
release
---
.github/workflows/python-package.yml | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index f96dd995e..6a2b3f63e 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -58,6 +58,7 @@ jobs:
# This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64)
##
build-shared-libs-cuda:
+ if: github.ref_name != 'multi-backend-refactor'
strategy:
matrix:
os: [ubuntu-latest, windows-latest]
@@ -148,7 +149,7 @@ jobs:
build-wheels:
needs:
- build-shared-libs
- - build-shared-libs-cuda
+ # - build-shared-libs-cuda (disabled to reduce the pkg size + build times for the preview release)
- build-shared-libs-rocm
strategy:
matrix:
From 5225ebea79305af8e02bf9368aa282bc62f9b195 Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Mon, 30 Sep 2024 17:49:11 -0600
Subject: [PATCH 14/29] refine docs for multi-backend alpha release (#1380)
* refine docs for multi-backend alpha release
* docs: further tweaks to multi-backend alpha docs
* docs: further tweaks to multi-backend alpha docs
* docs: further tweaks to multi-backend alpha docs
* docs: add multi-backend feedback links
* docs: add request for contributions
* docs: small fixes
* docs: small fixes
* docs: add info about `main` continuous build
* docs: further tweaks to multi-backend alpha docs
* docs: further tweaks to multi-backend alpha docs
---
docs/source/installation.mdx | 224 ++++++++++++++++++++++++------
docs/source/non_cuda_backends.mdx | 3 +
2 files changed, 184 insertions(+), 43 deletions(-)
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 2f82c199b..2ac56e03f 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -1,29 +1,45 @@
-# Installation
+# Installation Guide
-## CUDA
+Welcome to the installation guide for the `bitsandbytes` library! This document provides step-by-step instructions to install `bitsandbytes` across various platforms and hardware configurations. The library primarily supports CUDA-based GPUs, but the team is actively working on enabling support for additional backends like AMD ROCm, Intel, and Apple Silicon.
-bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**. However, there's a multi-backend effort under way which is currently in alpha release, check [the respective section below in case you're interested to help us with early feedback](#multi-backend).
+> [!TIP]
+> For a high-level overview of backend support and compatibility, see the [Multi-backend Support](#multi-backend) section.
-The latest version of bitsandbytes builds on:
+## Table of Contents
-| OS | CUDA | Compiler |
-|---|---|---|
-| Linux | 11.7 - 12.3 | GCC 11.4 |
-| | 12.4+ | GCC 13.2 |
-| Windows | 11.7 - 12.4 | MSVC 19.38+ (VS2022 17.8.0+) |
+- [CUDA](#cuda)
+ - [Installation via PyPI](#cuda-pip)
+ - [Compile from Source](#cuda-compile)
+- [Multi-backend Support (Alpha Release)](#multi-backend)
+ - [Supported Backends](#multi-backend-supported-backends)
+ - [Pre-requisites](#multi-backend-pre-requisites)
+ - [Installation](#multi-backend-pip)
+ - [Compile from Source](#multi-backend-compile)
+- [PyTorch CUDA Versions](#pytorch-cuda-versions)
-> [!TIP]
-> MacOS support is still a work in progress! Subscribe to this [issue](https://github.com/TimDettmers/bitsandbytes/issues/1020) to get notified about discussions and to track the integration progress.
+## CUDA[[cuda]]
-For Linux systems, make sure your hardware meets the following requirements to use bitsandbytes features.
+`bitsandbytes` is currently only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**. However, there's an ongoing multi-backend effort, currently in alpha. If you're interested in providing feedback or testing, check out [the multi-backend section below](#multi-backend).
-| **Feature** | **Hardware requirement** |
-|---|---|
-| LLM.int8() | NVIDIA Turing (RTX 20 series, T4) or Ampere (RTX 30 series, A4-A100) GPUs |
-| 8-bit optimizers/quantization | NVIDIA Kepler (GTX 780 or newer) |
+### Supported CUDA Configurations[[cuda-pip]]
+
+The latest version of `bitsandbytes` builds on the following configurations:
+
+| **OS** | **CUDA Version** | **Compiler** |
+|-------------|------------------|----------------------|
+| **Linux** | 11.7 - 12.3 | GCC 11.4 |
+| | 12.4+ | GCC 13.2 |
+| **Windows** | 11.7 - 12.4 | MSVC 19.38+ (VS2022) |
+
+For Linux systems, ensure your hardware meets the following requirements:
+
+| **Feature** | **Hardware Requirement** |
+|---------------------------------|--------------------------------------------------------------------|
+| LLM.int8() | NVIDIA Turing (RTX 20 series, T4) or Ampere (RTX 30 series, A4-A100) GPUs |
+| 8-bit optimizers/quantization | NVIDIA Kepler (GTX 780 or newer) |
> [!WARNING]
-> bitsandbytes >= 0.39.1 no longer includes Kepler binaries in pip installations. This requires manual compilation, and you should follow the general steps and use `cuda11x_nomatmul_kepler` for Kepler-targeted compilation.
+> `bitsandbytes >= 0.39.1` no longer includes Kepler binaries in pip installations. This requires [manual compilation](#cuda-compile) using the `cuda11x_nomatmul_kepler` configuration.
To install from PyPI.
@@ -31,14 +47,41 @@ To install from PyPI.
pip install bitsandbytes
```
-### Compile from source[[compile]]
+### `pip install` pre-built wheel from latest `main` commit
+
+If you would like to use new features even before they are officially released and help us test them, feel free to install the wheel directly from our CI (*the wheel links will remain stable!*):
+
+
+
+
+```
+# Note: if you don't want to reinstall BNB's dependencies, append the `--no-deps` flag!
+pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-0.44.2.dev0-py3-none-manylinux_2_24_x86_64.whl'
+```
+
+
+
+
+```
+# Note: if you don't want to reinstall BNB's dependencies, append the `--no-deps` flag!
+pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-macosx_13_1_arm64.whl'
+```
+
+
+
+### Compile from source[[cuda-compile]]
+
+> [!TIP]
+> Don't hesitate to compile from source! The process is pretty straightforward and resilient. This might be needed for older CUDA versions or other less common configurations, which we don't support out of the box due to package size.
-For Linux and Windows systems, you can compile bitsandbytes from source. Installing from source allows for more build options with different CMake configurations.
+For Linux and Windows systems, compiling from source allows you to customize the build configuration. See below for detailed platform-specific instructions (check `CMakeLists.txt` if you want to explore the specifics and additional options):
-To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. Make sure you have a compiler installed to compile C++ (gcc, make, headers, etc.). For example, to install a compiler and CMake on Ubuntu:
+To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. Make sure you have a compiler installed to compile C++ (`gcc`, `make`, headers, etc.).
+
+For example, to install a compiler and CMake on Ubuntu:
```bash
apt-get install -y build-essential cmake
@@ -48,16 +91,16 @@ You should also install CUDA Toolkit by following the [NVIDIA CUDA Installation
Refer to the following table if you're using another CUDA Toolkit version.
-| CUDA Toolkit | GCC |
-|---|---|
-| >= 11.4.1 | >= 11 |
-| >= 12.0 | >= 12 |
-| >= 12.4 | >= 13 |
+| CUDA Toolkit | GCC |
+|--------------|-------|
+| >= 11.4.1 | >= 11 |
+| >= 12.0 | >= 12 |
+| >= 12.4 | >= 13 |
Now to install the bitsandbytes package from source, run the following commands:
```bash
-git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
pip install -r requirements-dev.txt
cmake -DCOMPUTE_BACKEND=cuda -S .
make
@@ -81,7 +124,7 @@ Refer to the following table if you're using another CUDA Toolkit version.
| >= 11.6 | 19.30+ (VS2022) |
```bash
-git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
pip install -r requirements-dev.txt
cmake -DCOMPUTE_BACKEND=cuda -S .
cmake --build . --config Release
@@ -93,7 +136,7 @@ Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com
-### PyTorch CUDA versions
+### PyTorch CUDA versions[[pytorch-cuda-versions]]
Some bitsandbytes features may need a newer CUDA version than the one currently supported by PyTorch binaries from Conda and pip. In this case, you should follow these instructions to load a precompiled bitsandbytes binary.
@@ -105,7 +148,7 @@ Some bitsandbytes features may need a newer CUDA version than the one currently
Then locally install the CUDA version you need with this script from bitsandbytes:
```bash
-wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
+wget https://raw.githubusercontent.com/bitsandbytes-foundation/bitsandbytes/main/install_cuda.sh
# Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125}
# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
@@ -134,28 +177,62 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-11.7
3. Now when you launch bitsandbytes with these environment variables, the PyTorch CUDA version is overridden by the new CUDA version (in this example, version 11.7) and a different bitsandbytes library is loaded.
-## Multi-backend[[multi-backend]]
+## Multi-backend Support (Alpha Release)[[multi-backend]]
> [!TIP]
-> This functionality is currently in preview and therefore not yet production-ready! Please reference [this guide](./non_cuda_backends) for more in-depth information about the different backends and their current status.
+> This functionality is currently in preview and not yet production-ready. We very much welcome community feedback, contributions and leadership on topics like Apple Silicon as well as other less common accelerators! For more information, see [this guide on multi-backend support](./non_cuda_backends).
+
+**Link to give us feedback** (bugs, install issues, perf results, requests, etc.)**:**
+
+
+
+
+[**Multi-backend refactor: Alpha release (AMD ROCm ONLY)**](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1339)
+
+
+
+
+[**Multi-backend refactor: Alpha release (INTEL ONLY)**](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1338)
+
+
+
-Please follow these steps to install bitsandbytes with device-specific backend support other than CUDA:
+[**Github Discussion space on coordinating the kickoff of MPS backend development**](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1340)
-### Pip install the pre-built wheel (recommended for most)
+
+
-WIP (will be added in the coming days)
+### Supported Backends[[multi-backend-supported-backends]]
-### Compilation
+| **Backend** | **Supported Versions** | **Python versions** | **Architecture Support** | **Status** |
+|-------------|------------------------|---------------------------|-------------------------|------------|
+| **AMD ROCm** | 6.1+ | 3.10+ | minimum CDNA - `gfx90a`, RDNA - `gfx1100` | Alpha |
+| **Apple Silicon (MPS)** | WIP | 3.10+ | M1/M2 chips | Planned |
+| **Intel CPU** | v2.4.0+ (`ipex`) | 3.10+ | Intel CPU | Alpha |
+| **Intel GPU** | v2.4.0+ (`ipex`) | 3.10+ | Intel GPU | Experimental |
+
+For each supported backend, follow the respective instructions below:
+
+### Pre-requisites[[multi-backend-pre-requisites]]
+
+To use bitsandbytes non-CUDA backends, be sure to install:
+
+```
+pip install "transformers>=4.45.1"
+```
-#### AMD GPU
-
-bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release).
+> [!WARNING]
+> Pre-compiled binaries are only built for ROCm versions `6.1.0`/`6.1.1`/`6.1.2`/`6.2.0` and `gfx90a`, `gfx942`, `gfx1100` GPU architectures. [Find the pip install instructions here](#multi-backend-pip).
+>
+> Other supported versions that don't come with pre-compiled binaries [can be compiled for with these instructions](#multi-backend-compile).
+>
+> **Windows is not supported for the ROCm backend**; neither is WSL2, to our knowledge.
> [!TIP]
-> If you would like to install ROCm and PyTorch on bare metal, skip Docker steps and refer to our official guides at [ROCm installation overview](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/install-overview.html#rocm-install-overview) and [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) (Step 3 of wheels build for quick installation). Please make sure to get PyTorch wheel for the installed ROCm version.
+> If you would like to install ROCm and PyTorch on bare metal, skip the Docker steps and refer to ROCm's official guides at [ROCm installation overview](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/install-overview.html#rocm-install-overview) and [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) (Step 3 of wheels build for quick installation). Special note: please make sure to get the respective ROCm-specific PyTorch wheel for the installed ROCm version, e.g. `https://download.pytorch.org/whl/nightly/rocm6.2/`!
```bash
# Create a docker container with latest ROCm image, which includes ROCm libraries
@@ -165,9 +242,70 @@ apt-get update && apt-get install -y git && cd home
# Install pytorch compatible with above ROCm version
pip install torch --index-url https://download.pytorch.org/whl/rocm6.1/
+```
-# Install bitsandbytes from PyPI
-# (This is supported on Ubuntu 22.04, Python 3.10, ROCm 6.1.0/6.1.1/6.1.2/6.2.0 and gpu arch - gfx90a, gfx942, gfx1100
+
+
+
+You need compatible Intel hardware and a working environment in which `import intel_extension_for_pytorch as ipex` succeeds, with Python `3.10` as the minimum requirement.
+
+Please refer to [the official Intel installation instructions](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu&version=v2.4.0%2bcpu&os=linux%2fwsl2) for guidance on how to pip install the necessary `intel_extension_for_pytorch` dependency.
+
+
+
+
+> [!TIP]
+> Apple Silicon support is still a WIP. Please visit [this GitHub Discussion space on coordinating the kickoff of MPS backend development](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1340) and help coordinate a community-led effort to implement this backend.
+
+
+
+
+### Installation
+
+You can install the pre-built wheels for each backend, or compile from source for custom configurations.
+
+#### Pre-built Wheel Installation (recommended)[[multi-backend-pip]]
+
+
+
+
+```
+# Note: if you don't want to reinstall BNB's dependencies, append the `--no-deps` flag!
+pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-manylinux_2_24_x86_64.whl'
+```
+
+
+
+
+```
+# Note: if you don't want to reinstall BNB's dependencies, append the `--no-deps` flag!
+pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-win_amd64.whl'
+```
+
+
+
+
+> [!WARNING]
+> bitsandbytes does not yet support Apple Silicon / Metal with a dedicated backend. However, the build infrastructure is in place, and the pip install below will eventually provide Apple Silicon support as it becomes available on the `multi-backend-refactor` branch through community contributions.
+
+```
+# Note: if you don't want to reinstall BNB's dependencies, append the `--no-deps` flag!
+pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-macosx_13_1_arm64.whl'
+```
+
+
+
+
+#### Compile from Source[[multi-backend-compile]]
+
+
+
+
+#### AMD GPU
+
+bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release).
+
+```bash
# Please install from source if your configuration doesn't match with these)
pip install bitsandbytes
@@ -195,10 +333,10 @@ pip install -e . # `-e` for "editable" install, when developing BNB (otherwise
Similar to the CUDA case, you can compile bitsandbytes from source for Linux and Windows systems.
-The below commands are for Linux. For installing on Windows, please adapt the below commands according to the same pattern as described [the section above on compiling from source under the Windows tab](#compile).
+The below commands are for Linux. For installing on Windows, please adapt the below commands according to the same pattern as described in [the section above on compiling from source under the Windows tab](#cuda-compile).
```
-git clone --depth 1 -b multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
pip install intel_extension_for_pytorch
pip install -r requirements-dev.txt
cmake -DCOMPUTE_BACKEND=cpu -S .
diff --git a/docs/source/non_cuda_backends.mdx b/docs/source/non_cuda_backends.mdx
index fc7c6ac27..728606b7b 100644
--- a/docs/source/non_cuda_backends.mdx
+++ b/docs/source/non_cuda_backends.mdx
@@ -1,5 +1,8 @@
# Multi-backend support (non-CUDA backends)
+> [!Tip]
+> If you feel these docs need additional info, please consider submitting a PR or requesting the missing info in one of the GitHub discussion spaces mentioned below.
+
As part of a recent refactoring effort, we will soon offer official multi-backend support. Currently, this feature is available in a preview alpha release, allowing us to gather early feedback from users to improve the functionality and identify any bugs.
At present, the Intel CPU and AMD ROCm backends are considered fully functional. The Intel XPU backend has limited functionality and is less mature.
From e6cc10934c72f1ddc99944331da6a95673a605d6 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Tue, 1 Oct 2024 14:01:09 +0000
Subject: [PATCH 15/29] docs: remove 2 obsolete lines
---
docs/source/installation.mdx | 3 ---
1 file changed, 3 deletions(-)
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 2ac56e03f..609865436 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -306,9 +306,6 @@ pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsan
bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release).
```bash
-# Please install from source if your configuration doesn't match with these)
-pip install bitsandbytes
-
# Install bitsandbytes from source
# Clone bitsandbytes repo, ROCm backend is currently enabled on multi-backend-refactor branch
git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
From cd3cb6812dd4f2579f04ff51efa0662cc0467c63 Mon Sep 17 00:00:00 2001
From: pnunna93 <104791500+pnunna93@users.noreply.github.com>
Date: Wed, 16 Oct 2024 15:51:32 -0500
Subject: [PATCH 16/29] Remove depth option in installation steps (#1395)
* Add build job for rocm
* Add rocm build script
* Copy shared obj file into output_dir
* upload build artifacts and enable wheels build
* Remove cuda build temporarily
* Add ROCm version to .so filename
* Add rocm_version to whls build
* Revert "Remove cuda build temporarily"
This reverts commit 1413c5f3a2aed51140b86daa8ee9283c67cce738.
* Add rocm_version env var
* Remove thrush header files
* Print node info
* print cuda node info
* Revert "print cuda node info"
This reverts commit cdb209a2eb896d9c4166f53e9b2aa580c10e42c0.
* Revert "Print node info"
This reverts commit 7e9a65c33f66fffcb14ee2438170718777c06022.
* Add rocm arch to compile command
* Rename .so files to rocm
* Update default gpu arch
* Skip cpu based igemmlt int tests on ROCm
* Update Documentation
* Update upstream repo name
* Update docs
* Update string format
Co-authored-by: Aarni Koskela
* Remove pre-release option for torch install
* Update pytorch install path
Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
* Add messages for Heuristics error
* Remove toolcache for disk space
* print disk usage
* Clean disk space for linux
* Fix for ubuntu
* Add sudo for apt clean
* Update clean up disk list
* remove disk usage print
* Add BNB_BACKEND variable
* Update diagnostic functions for ROCm
* Fix tuple error
* Fix library detection bug for recursive and symlink cases
* fix pre-commit errors
* Remove recursive path lib search
* Create function for runtime lib patterns
* Update logger format
Co-authored-by: Aarni Koskela
* Update error reporting
Co-authored-by: Aarni Koskela
* Remove commented code
Co-authored-by: Aarni Koskela
* Update error reporting
Co-authored-by: Aarni Koskela
* Update error reporting
* Create hip diagnostics functions
* Fix Typo
* Fix pre-commit checks
* Enable 6.2 build
* Skip gemv 4 bit cpu test
* Update documentation for 6.2.0 pip install
* Update README for default branch change
* Fix typo
* Sync README with upstream
* Remove depth
---------
Co-authored-by: Aarni Koskela
Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Co-authored-by: Aswin John Mathews <81309834+amathews-amd@users.noreply.github.com>
Co-authored-by: root
---
docs/source/installation.mdx | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 609865436..d1acb2cd6 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -308,7 +308,7 @@ bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha releas
```bash
# Install bitsandbytes from source
# Clone bitsandbytes repo, ROCm backend is currently enabled on multi-backend-refactor branch
-git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
+git clone -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
# Install dependencies
pip install -r requirements-dev.txt
From cd73601fcb70f83f663b71c0169548facba3cd06 Mon Sep 17 00:00:00 2001
From: Huazhong Ji
Date: Wed, 20 Nov 2024 20:24:45 +0800
Subject: [PATCH 17/29] Fix issue where no valid semantic version tag is found when
installing bitsandbytes from source in a personal repo (#1419)
---
setup.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/setup.py b/setup.py
index 2b1c1aff3..e8d3f547c 100644
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,8 @@ def get_latest_semver_tag():
tags = subprocess.check_output(["git", "tag"], text=True).splitlines()
semver_tags = [tag for tag in tags if tag.count(".") == 2 and all(part.isdigit() for part in tag.split("."))]
if not semver_tags:
- raise ValueError("No valid semantic version tags found")
+ print("No valid semantic version tags found, defaulting to 0.0.1")
+ semver_tags = ["0.0.1"]
return sorted(semver_tags, key=lambda s: list(map(int, s.split("."))))[-1]
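With this change the tag selection degrades gracefully when a clone (for example a personal fork) carries no `X.Y.Z` tags at all. A self-contained sketch of the selection plus fallback, using made-up tag lists:

```python
def latest_semver_tag(tags):
    # Keep only plain X.Y.Z tags and pick the numerically largest one;
    # fall back to "0.0.1" when none are present (e.g. an untagged fork).
    semver = [t for t in tags if t.count(".") == 2 and all(p.isdigit() for p in t.split("."))]
    if not semver:
        return "0.0.1"
    return max(semver, key=lambda s: tuple(map(int, s.split("."))))

assert latest_semver_tag(["0.9.1", "0.44.0", "v1.0-rc1"]) == "0.44.0"
assert latest_semver_tag([]) == "0.0.1"
```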
From b2ac4232999648bffb9c2a8b1a997ddd1029eadf Mon Sep 17 00:00:00 2001
From: jiqing-feng
Date: Fri, 29 Nov 2024 22:48:17 +0800
Subject: [PATCH 18/29] Enable XPU and optimize cpu/xpu op (#1418)
* enable new ipex API
ipex weight is 4D so we cannot transpose
fix dequant
check require grad
* use ipex op in backward
* enable backward
* Multi backend refactor (#8)
* AMD: Clarify diagnostic messages; free up disk space for CI build
* Add build job for rocm
* Add rocm build script
* Copy shared obj file into output_dir
* upload build artifacts and enable wheels build
* Remove cuda build temporarily
* Add ROCm version to .so filename
* Add rocm_version to whls build
* Revert "Remove cuda build temporarily"
This reverts commit 1413c5f3a2aed51140b86daa8ee9283c67cce738.
* Add rocm_version env var
* Remove thrush header files
* Print node info
* print cuda node info
* Revert "print cuda node info"
This reverts commit cdb209a2eb896d9c4166f53e9b2aa580c10e42c0.
* Revert "Print node info"
This reverts commit 7e9a65c33f66fffcb14ee2438170718777c06022.
* Add rocm arch to compile command
* Rename .so files to rocm
* Update default gpu arch
* Skip cpu based igemmlt int tests on ROCm
* Update Documentation
* Update upstream repo name
* Update docs
* Update string format
Co-authored-by: Aarni Koskela
* Remove pre-release option for torch install
* Update pytorch install path
Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
* Add messages for Heuristics error
* Remove toolcache for disk space
* print disk usage
* Clean disk space for linux
* Fix for ubuntu
* Add sudo for apt clean
* Update clean up disk list
* remove disk usage print
* Add BNB_BACKEND variable
* Update diagnostic functions for ROCm
* Fix tuple error
* Fix library detection bug for recursive and symlink cases
* fix pre-commit errors
* Remove recursive path lib search
* Create function for runtime lib patterns
* Update logger format
Co-authored-by: Aarni Koskela
* Update error reporting
Co-authored-by: Aarni Koskela
* Remove commented code
Co-authored-by: Aarni Koskela
* Update error reporting
Co-authored-by: Aarni Koskela
* Update error reporting
* Create hip diagnostics functions
* Fix Typo
* Fix pre-commit checks
---------
Co-authored-by: Aarni Koskela
Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
* check grad before using ipex (#1358)
* Enable packaging for ROCm 6.2 (#1367)
* Enable 6.2 build
* Update documentation for 6.2.0 pip install
* Update for VS2022 17.11 compatibility with CUDA < 12.4 (#1341)
* Update for VS2022 17.11 compatibility with CUDA < 12.4
* Try again
* Enable continuous releases for multi-backend-refactor branch
* Update release workflow
* Publish continuous release for multi-backend
* continuous release: revert wheel renaming due to install err
* Revert "continuous release: revert wheel renaming due to install err"
This reverts commit 0a2b5392ff079645fdc9ff887f80d327f9e874f7.
* add dynamic tag-based versioning + git hash for dev vers
* docs: update w/ changes from `main`
* get tags for dynamic versioning
* fine-tune continuous release params
* reduce the pkg size + build times for the preview release
* refine docs for multi-backend alpha release (#1380)
* refine docs for multi-backend alpha release
* docs: further tweaks to multi-backend alpha docs
* docs: further tweaks to multi-backend alpha docs
* docs: further tweaks to multi-backend alpha docs
* docs: add multi-backend feedback links
* docs: add request for contributions
* docs: small fixes
* docs: small fixes
* docs: add info about `main` continuous build
* docs: further tweaks to multi-backend alpha docs
* docs: further tweaks to multi-backend alpha docs
* docs: remove 2 obsolete lines
---------
Co-authored-by: pnunna93 <104791500+pnunna93@users.noreply.github.com>
Co-authored-by: Aarni Koskela
Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Co-authored-by: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
* Revert "enable backward"
This reverts commit cd7bf2145807932c8a8a499ddb6bb14e47eb24fc.
* Revert "use ipex op in backward"
This reverts commit b8df1aad9414a669e188678b36be304400987a72.
* fix finetune
* check training
* fix gemv check
* reformat
* avoid double quant in backward if not needed
* Zh/xpu support (#9)
* Add xpu support
* Add xpu support for int8
* Add xpu dequant kernel support
* update code
* remove debug comments
* remove redundant comments
* Add xpu integration for woqlinear
* correct the comments
* Update cpu_xpu_common.py
---------
Co-authored-by: zhuhong61
Co-authored-by: zhuhong61 <95205772+zhuhong61@users.noreply.github.com>
* avoid import triton if CPU and XPU backend
* fix setup in docker without git config
* xpu do not support compile for now
Signed-off-by: jiqing-feng
* update xpu
Signed-off-by: jiqing-feng
* update 4bit compute dtype
* fix xpu int8 path
Signed-off-by: jiqing-feng
* optimize 4bit dequant
Signed-off-by: jiqing-feng
* fix xpu dequant
Signed-off-by: jiqing-feng
* add empty cache in each xpu op
* add nf4 dequant ipex kernel
* fix dequant 4bit op
* empty cache has negative effect on 4bit gemv
* fix xpu save
* fix save
* xpu use float16 default
Signed-off-by: jiqing-feng
* rm empty cache as it cause slower perf
Signed-off-by: jiqing-feng
* fix xpu save
Signed-off-by: jiqing-feng
* fix 8bit int8 param device
Signed-off-by: jiqing-feng
* fix 8bit int8 param device
Signed-off-by: jiqing-feng
* fix 8bit int8 param device
Signed-off-by: jiqing-feng
* fix 8bit int8 param device
Signed-off-by: jiqing-feng
* fix format
* update readme for Intel CPU and XPU do not need make csrc codes
* fix format
* fix import
---------
Signed-off-by: jiqing-feng
Co-authored-by: pnunna93 <104791500+pnunna93@users.noreply.github.com>
Co-authored-by: Aarni Koskela
Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Co-authored-by: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Co-authored-by: zhuhong61
Co-authored-by: zhuhong61 <95205772+zhuhong61@users.noreply.github.com>
---
bitsandbytes/__init__.py | 8 ++-
bitsandbytes/autograd/_functions.py | 17 +++--
bitsandbytes/backends/cpu_xpu_common.py | 70 ++++++++++--------
bitsandbytes/backends/xpu.py | 95 ++++++++++++++++++++++---
bitsandbytes/functional.py | 22 +++---
bitsandbytes/nn/__init__.py | 16 +++--
bitsandbytes/nn/modules.py | 66 ++++++++++++-----
bitsandbytes/utils.py | 41 +++++++----
docs/source/installation.mdx | 6 +-
docs/source/non_cuda_backends.mdx | 6 +-
10 files changed, 246 insertions(+), 101 deletions(-)
mode change 100644 => 100755 bitsandbytes/nn/modules.py
diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 25ec8a79a..c705137c0 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -17,11 +17,10 @@
matmul_cublas,
mm_cublas,
)
-from .backends import register_backend
+from .backends import backends, register_backend
from .backends.cpu import CPUBackend
from .backends.npu import NPUBackend
from .cextension import lib
-from .nn import modules
features = {"multi_backend"}
supported_torch_devices = {
@@ -64,6 +63,11 @@
if hasattr(torch, "npu") and torch.npu.is_available():
register_backend("npu", NPUBackend())
+
+# import modules only after the backends have been decided
+if backends:
+ from .nn import modules
+
# TODO: Other potential backends:
# XLA - Google TPU / PJRT runtime
# HPU - Habana / Intel Gaudi
diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 59e26ad09..9765def05 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -221,7 +221,7 @@ def backward(ctx, grad_output):
def supports_igemmlt(device: torch.device) -> bool:
"""check if this device supports the optimized int8 kernel"""
- if device == torch.device("cpu"):
+ if device.type in ("cpu", "xpu"):
return True
if torch.version.hip:
return False if BNB_HIP_VERSION < 601 else True
@@ -463,7 +463,9 @@ def backward(ctx, grad_output):
if len(grad_output.shape) == 3:
grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous()
- Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16))
+ Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = None, None, None, None, None
+ if req_gradB or (req_gradA and state.CBt):
+ Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16))
if req_gradB:
CxAt, SAt = F.transform(CAt, formatB, transpose=True)
C32grad, Sgrad = F.transform(Cgradt, "col32", transpose=True)
@@ -575,8 +577,15 @@ def matmul_4bit(
bias=None,
):
assert quant_state is not None
- if (A.numel() == A.shape[-1] or A.device.type == "cpu") and A.requires_grad == False:
- # CPU backend does not require A to be a vector
+ if A.device.type in ("cpu", "xpu") and A.requires_grad == False:
+ if getattr(quant_state, "ipex", False):
+ out = F.gemv_4bit(A, B.t(), out, state=quant_state)
+ if bias is not None:
+ out += bias
+ return out
+ else:
+ return MatMul4Bit.apply(A, B, out, bias, quant_state)
+ elif A.numel() == A.shape[-1] and A.requires_grad == False:
if A.shape[-1] % quant_state.blocksize != 0:
warn(
f"Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}",
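For CPU/XPU tensors during inference, `matmul_4bit` now splits into two paths: if the weight has been repacked by IPEX (`quant_state.ipex` is set) the fused gemv kernel is used, otherwise the generic 4-bit matmul autograd path is taken. A hedged sketch of that dispatch; `fused_gemv` and `generic_matmul` are placeholder callables standing in for `F.gemv_4bit` and `MatMul4Bit.apply`, not the real bitsandbytes API:

```python
import torch

def dispatch_matmul_4bit(A: torch.Tensor, B: torch.Tensor, quant_state,
                         fused_gemv, generic_matmul, bias=None):
    # Inference-only fast path for CPU/XPU inputs, per the patch above.
    if A.device.type in ("cpu", "xpu") and not A.requires_grad:
        if getattr(quant_state, "ipex", False):
            # weight already repacked by IPEX: use the fused gemv kernel
            out = fused_gemv(A, B.t(), state=quant_state)
            return out if bias is None else out + bias
        # otherwise take the generic 4-bit matmul (autograd) path
        return generic_matmul(A, B, bias, quant_state)
    # other devices keep the existing gemv/matmul selection
    raise NotImplementedError("sketch covers only the CPU/XPU inference branch")
```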
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 0d865b541..d2e0c2593 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -15,6 +15,7 @@
ipex_cpu = ipex if ipex._C._has_cpu() else None
ipex_xpu = ipex if ipex._C._has_xpu() else None
+ ipex_cpu_only = ipex._C._has_cpu() and (not ipex._C._has_xpu())
except BaseException:
ipex_cpu = None
ipex_xpu = None
@@ -55,7 +56,7 @@ def _ipex_xpu_version_prereq(major, minor):
def _maybe_torch_compile(func):
# torch.compile requires g++ and pytorch >= 2.0
- if gxx_available and _torch_version_prereq(2, 0):
+ if gxx_available and _torch_version_prereq(2, 0) and not ipex_xpu:
options = {}
# fx_graph_cache requires pytorch >= 2.2
if _torch_version_prereq(2, 2):
@@ -181,7 +182,7 @@ def igemmlt_impl(A, B, SA=None, SB=None, out=None, Sout=None, dtype=torch.int32)
A_reshaped = A.reshape(m, k)
# torch._int_mm is available on CPU since torch 2.4
- if _torch_version_prereq(2, 4):
+ if _torch_version_prereq(2, 4) and A.device.type == "cpu":
C = torch._int_mm(A_reshaped, B.T).to(dtype)
else:
C = torch.matmul(A_reshaped.float(), B.t().float()).to(dtype)
@@ -233,8 +234,10 @@ def mm_dequant_impl(
out_shape = (out_shape[0] * out_shape[1], out_shape[2])
if compute_dtype not in [torch.float32, torch.bfloat16]:
- warnings.warn(f"mm_dequant_{A.device}: compute_dtype {compute_dtype} is not supported, will use float instead")
- compute_dtype = torch.float32
+ warnings.warn(
+ f"mm_dequant_{A.device}: compute_dtype {compute_dtype} is not supported, will use bfloat16 instead"
+ )
+ compute_dtype = torch.bfloat16
A_reshaped = A.reshape(out_shape).to(compute_dtype)
row_stats = row_stats.reshape(-1).unsqueeze(-1).to(compute_dtype)
col_stats = col_stats.reshape(-1).unsqueeze(0).to(compute_dtype)
@@ -342,7 +345,7 @@ def quantize_4bit_impl(
scaled_A_rem = torch.clamp(A_reshaped[n - rem :] * (1 / absmax[-1]), -1, 1)
scaled_A = torch.cat([scaled_A, scaled_A_rem], dim=0)
# map [-1, 1] to nf4/fp4
- out_uint8 = torch.empty(scaled_A.shape, dtype=torch.uint8)
+ out_uint8 = torch.empty(scaled_A.shape, dtype=torch.uint8, device=A.device)
if quant_type == "nf4":
for i in range(len(NF4_QUANT_TABLE)):
out_uint8[scaled_A > NF4_QUANT_TABLE[i]] = i
@@ -408,7 +411,6 @@ def dequantize_4bit_impl(
torch.Tensor:
Dequantized tensor.
"""
-
if A.shape[0] == 1:
transpose = False
A = A.squeeze(0)
@@ -438,23 +440,18 @@ def dequantize_4bit_impl(
if quant_state.nested:
raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU")
- if ipex_cpu and _ipex_cpu_version_prereq(2, 3) and hasattr(quant_state, "op_context"):
- assert quant_state.op_context is not None
- A = quant_state.op_context.to_public(quant_state.op_context.get_weight())
- A = A.reshape(-1)
- absmax = quant_state.op_context.get_scales().reshape(-1)
-
- if out is None:
- out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device)
+ if ipex_cpu_only and _ipex_cpu_version_prereq(2, 5) and getattr(quant_state, "ipex", False):
+ A = torch.ops.ipex_prepack.woq_linear_unpack_weight(A, "nf4", quant_state.shape, 2)
+ quant_state.ipex = False
- n = out.numel()
# Map nf4 to [-1, 1]
- out_uint8 = torch.empty(A.size(0) * 2, dtype=torch.uint8, device=A.device)
- out_uint8[::2] = A.bitwise_and(0xF)
- out_uint8[1::2] = A.bitwise_right_shift(4)
- out_dq = torch.empty(out_uint8.shape).to(quant_state.dtype)
- for i in range(len(quant_state.code)):
- out_dq[out_uint8 == i] = quant_state.code[i]
+ out_dq = torch.empty(A.size(0) * 2, dtype=torch.int32, device=A.device)
+ n = out_dq.numel()
+ out_dq[::2] = A & 0xF
+ out_dq[1::2] = A >> 4
+    # quant_state.code is fp32; cast it to the quant_state dtype to avoid a dtype mismatch
+ quant_state.code = quant_state.code.to(quant_state.dtype)
+ out_dq = quant_state.code[out_dq]
# Apply scales
if out_dq.numel() != n:
@@ -464,12 +461,17 @@ def dequantize_4bit_impl(
blocks += 1 if n % blocksize > 0 else 0
rem = n % blocksize
has_rem = rem > 0
- out_reshaped = out.reshape(-1)
- out_reshaped[: n - rem] = (out_dq[: n - rem].view(-1, blocksize) * absmax[: blocks - has_rem].view(-1, 1)).reshape(
- -1
- )
+
if has_rem:
+ if out is None:
+ out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device)
+ out_reshaped = out.reshape(-1)
+ out_reshaped[: n - rem] = (
+ out_dq[: n - rem].view(-1, blocksize) * absmax[: blocks - has_rem].view(-1, 1)
+ ).reshape(-1)
out_reshaped[n - rem :] = out_dq[n - rem :] * absmax[-1]
+ else:
+ out = (out_dq.view(-1, blocksize) * absmax.view(-1, 1)).reshape(quant_state.shape).to(quant_state.dtype)
# take transpose here because weight is transposed (again) for computation
if transpose:
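
The unpack above stores two 4-bit indices per byte (low nibble first), maps them through `quant_state.code` with a single gather, and then applies the per-block `absmax`. A self-contained sketch of that unpack-and-scale step, using an arbitrary 16-entry code table rather than the real NF4 constants:

```python
import torch

def dequantize_nf4_reference(packed: torch.Tensor, code: torch.Tensor,
                             absmax: torch.Tensor, blocksize: int) -> torch.Tensor:
    """Unpack uint8-packed 4-bit indices and apply blockwise scales.

    packed:  uint8 tensor, two 4-bit values per byte (low nibble first)
    code:    16-entry lookup table (e.g. the NF4 grid), cast to the output dtype
    absmax:  one scale per block of `blocksize` elements
    """
    idx = torch.empty(packed.numel() * 2, dtype=torch.long, device=packed.device)
    idx[::2] = packed & 0xF   # low nibble
    idx[1::2] = packed >> 4   # high nibble
    out = code[idx]           # single gather through the code book
    return (out.view(-1, blocksize) * absmax.view(-1, 1)).reshape(-1)

# toy example: 2 blocks of 4 values, arbitrary code table
code = torch.linspace(-1, 1, 16)
packed = torch.tensor([0x10, 0x32, 0x54, 0x76], dtype=torch.uint8)
absmax = torch.tensor([2.0, 0.5])
print(dequantize_nf4_reference(packed, code, absmax, blocksize=4))
```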
@@ -510,9 +512,21 @@ def gemm_4bit_impl(
torch.Tensor:
GEMM output tensor.
"""
- if ipex_cpu and _ipex_cpu_version_prereq(2, 3) and hasattr(state, "op_context"):
- assert state.op_context is not None
- output = torch.ops.torch_ipex.ipex_woq_linear(A, state.op_context.get_data_handle())
+ if getattr(state, "ipex", False):
+ output = torch.ops.torch_ipex.woq_linear(
+ A,
+ B,
+ "nf4",
+ state.shape,
+ state.new_scales,
+ state.new_zeros,
+ None,
+ None,
+ state.blocksize,
+ ipex_cpu.quantization.WoqLowpMode.BF16,
+ 1,
+ state.compensation,
+ )
else:
dqB = dequantize_4bit_impl(B, state, blocksize=state.blocksize).t()
output = torch.matmul(A, dqB.to(A.dtype))
diff --git a/bitsandbytes/backends/xpu.py b/bitsandbytes/backends/xpu.py
index 3976c4d5a..bc13963e6 100644
--- a/bitsandbytes/backends/xpu.py
+++ b/bitsandbytes/backends/xpu.py
@@ -5,9 +5,36 @@
from bitsandbytes.utils import QuantState
from .base import Backend
+from .cpu_xpu_common import (
+ dequantize_4bit_impl,
+ double_quant_impl,
+ gemm_4bit_impl,
+ igemmlt_impl,
+ mm_dequant_impl,
+ quantize_4bit_impl,
+)
+
+Tensor = torch.Tensor
+
+
+def assert_on_xpu(tensors):
+ on_xpu = True
+ for t in tensors:
+ if t is None:
+ continue # NULL pointers are fine
+ on_xpu &= t.device.type == "xpu"
+ if not on_xpu:
+ raise TypeError(
+ "All input tensors need to be on XPU, but found some tensors to not be on XPU:\n"
+ f" {[(t.shape, t.device) if isinstance(t, Tensor) else None for t in tensors]}"
+ )
+ return on_xpu
class XPUBackend(Backend):
+ mm_dequant_compute_dtype = torch.bfloat16
+ mm_dequant_output_dtype = torch.bfloat16
+
def double_quant(
self,
A: torch.Tensor,
@@ -17,7 +44,9 @@ def double_quant(
out_row: Optional[torch.Tensor] = None,
threshold=0.0,
):
- raise NotImplementedError
+ assert_on_xpu([A, col_stats, row_stats, out_col, out_row])
+ output = double_quant_impl(A, col_stats, row_stats, out_col, out_row, threshold)
+ return output
def transform(
self,
@@ -29,7 +58,23 @@ def transform(
state: Optional[Tuple[torch.Size, str]] = None,
ld=None,
):
- raise NotImplementedError
+ """
+        Transform tensor A to to_order. This was originally designed for CUDA.
+        For XPU, it returns the original tensor if transpose=False;
+        otherwise, it returns the transpose of A.
+ """
+ assert_on_xpu([A, out])
+ if transpose:
+ if out is not None:
+ out.copy_(A.T)
+ else:
+ out = A.T
+ else:
+ if out is not None:
+ out.copy_(A)
+ else:
+ out = A
+ return out, state
def igemmlt(
self,
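
Since col32-style layout conversion is a CUDA concern, the XPU `transform` above reduces to an optional transpose plus an optional copy into `out`. A short reference sketch of the same behaviour (the helper name is made up):

```python
from typing import Optional

import torch

def xpu_transform_reference(A: torch.Tensor, transpose: bool = False,
                            out: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Mirror of the XPU transform semantics above: no layout conversion,
    only an optional transpose, written into `out` if one is supplied."""
    result = A.T if transpose else A
    if out is not None:
        out.copy_(result)
        return out
    return result

A = torch.arange(6.0).reshape(2, 3)
assert torch.equal(xpu_transform_reference(A, transpose=True), A.T)
```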
@@ -41,7 +86,9 @@ def igemmlt(
Sout: Optional[Tuple[torch.Size, str]] = None,
dtype=torch.int32,
) -> Union[torch.Tensor, Tuple[Optional[Tuple[torch.Tensor, Tuple[torch.Size, str]]]]]:
- raise NotImplementedError
+ assert_on_xpu([A, B])
+ output = igemmlt_impl(A, B, SA, SB, out, Sout, dtype)
+ return output
def mm_dequant(
self,
@@ -54,7 +101,20 @@ def mm_dequant(
new_col_stats: Optional[torch.Tensor] = None,
bias: Optional[torch.Tensor] = None,
) -> torch.Tensor:
- raise NotImplementedError
+ assert_on_xpu([A, row_stats, col_stats, out, bias])
+ output = mm_dequant_impl(
+ A,
+ quant_state,
+ row_stats,
+ col_stats,
+ out,
+ new_row_stats,
+ new_col_stats,
+ bias,
+ self.mm_dequant_compute_dtype,
+ self.mm_dequant_output_dtype,
+ )
+ return output
def extract_outliers(
self,
@@ -62,7 +122,9 @@ def extract_outliers(
SA: Tuple[torch.Size, str],
idx: torch.Tensor,
) -> torch.Tensor:
- raise NotImplementedError
+ assert_on_xpu([A])
+ output = A[:, idx].contiguous()
+ return output
def quantize_4bit(
self,
@@ -74,7 +136,12 @@ def quantize_4bit(
quant_type: Literal["fp4", "nf4"] = "fp4",
quant_storage=torch.uint8,
) -> Tuple[torch.Tensor, QuantState]:
- raise NotImplementedError
+ if blocksize is None:
+ blocksize = 64
+ assert_on_xpu([A, absmax, out])
+ assert quant_storage == torch.uint8, "XPU backend only supports uint8 quant_storage"
+ output = quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type)
+ return output
def dequantize_4bit(
self,
@@ -85,7 +152,15 @@ def dequantize_4bit(
blocksize: int = 64,
quant_type: Literal["fp4", "nf4"] = "fp4",
) -> torch.Tensor:
- raise NotImplementedError
+ if blocksize is None:
+ blocksize = 64
+ assert_on_xpu([A, absmax, out])
+ if quant_type == "nf4":
+ output = torch.ops.torch_ipex.dequantize_4bit(A, "nf4", quant_state.shape, absmax, None, blocksize).t()
+ else:
+ output = dequantize_4bit_impl(A, quant_state, absmax, out, blocksize, quant_type)
+
+ return output
def gemv_4bit(
self,
@@ -96,7 +171,11 @@ def gemv_4bit(
transposed_B=False,
state: QuantState = None,
) -> torch.Tensor:
- raise NotImplementedError
+ assert_on_xpu([A, B, out])
+ if state is None:
+ raise ValueError("state cannot be None. gemv_4bit() requires the state from quantize_4bit()")
+ output = gemm_4bit_impl(A, B, out, transposed_A, transposed_B, state)
+ return output
def dequantize_blockwise(
self,
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 6cf64df28..3c730cb16 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -1006,11 +1006,6 @@ def dequantize_fp4(
out: Optional[torch.Tensor] = None,
blocksize: Optional[int] = None,
) -> Tensor:
- if blocksize is None:
- # Some AMD GPUs have warpsize 64
- # Set default blocksize to 128 (~warpsize 64 in kernel) for HIP
- blocksize = 64 if not HIP_ENVIRONMENT else 128
-
return dequantize_4bit(A, quant_state, absmax, out, blocksize, "fp4")
@@ -1021,11 +1016,6 @@ def dequantize_nf4(
out: Optional[torch.Tensor] = None,
blocksize: Optional[int] = None,
) -> Tensor:
- if blocksize is None:
- # Some AMD GPUs have warpsize 64
- # Set default blocksize to 128 (~warpsize 64 in kernel) for HIP
- blocksize = 64 if not HIP_ENVIRONMENT else 128
-
return dequantize_4bit(A, quant_state, absmax, out, blocksize, "nf4")
@@ -1035,7 +1025,7 @@ def dequantize_4bit(
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize: Optional[int] = None,
- quant_type="fp4",
+ quant_type=None,
) -> Tensor:
"""
Dequantizes FP4 blockwise quantized values.
@@ -1064,6 +1054,14 @@ def dequantize_4bit(
Dequantized tensor.
"""
ensure_backend_is_available(A.device.type)
+ if quant_state is not None:
+        absmax = absmax if absmax is not None else quant_state.absmax
+ quant_type = quant_type or quant_state.quant_type
+ blocksize = blocksize or quant_state.blocksize
+ if blocksize is None:
+ # Some AMD GPUs have warpsize 64
+ # Set default blocksize to 128 (~warpsize 64 in kernel) for HIP
+ blocksize = 64 if not HIP_ENVIRONMENT else 128
return backends[A.device.type].dequantize_4bit(
A, quant_state=quant_state, absmax=absmax, out=out, blocksize=blocksize, quant_type=quant_type
)
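
With the new defaults, callers can pass just `A` and `quant_state`; anything left as `None` is filled from the state, and only a still-missing blocksize falls back to the HIP-aware default. A hedged sketch of that resolution order (`resolve_dequant_args` is illustrative, not part of the library):

```python
from typing import Optional

HIP_ENVIRONMENT = False  # assumption: set at import time, True on ROCm builds

def resolve_dequant_args(quant_state, absmax=None, quant_type: Optional[str] = None,
                         blocksize: Optional[int] = None):
    """Fill missing arguments from quant_state, mirroring dequantize_4bit above."""
    if quant_state is not None:
        absmax = absmax if absmax is not None else quant_state.absmax
        quant_type = quant_type or quant_state.quant_type
        blocksize = blocksize or quant_state.blocksize
    if blocksize is None:
        # Some AMD GPUs have warpsize 64, hence the larger HIP default.
        blocksize = 64 if not HIP_ENVIRONMENT else 128
    return absmax, quant_type, blocksize

print(resolve_dequant_args(None))  # (None, None, 64) on a non-HIP build
```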
@@ -1800,7 +1798,7 @@ class COOSparseTensor:
def __init__(self, rows, cols, nnz, rowidx, colidx, values):
assert rowidx.dtype == torch.int32
assert colidx.dtype == torch.int32
- if values.device == torch.device("cpu"):
+        if values.device.type in ("cpu", "xpu"):
assert values.dtype in [torch.bfloat16, torch.half, torch.float]
else:
assert values.dtype == torch.float16
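
One detail worth spelling out: comparing a device against `torch.device("cpu")` only matches the index-free device, so checks that should also accept `cpu:0`/`xpu:0` (or several device types at once) are more robust when written against `device.type`, as above. A tiny illustration:

```python
import torch

dev = torch.device("cpu")
# Membership test on the device *type* accepts indexed devices as well.
assert dev.type in ("cpu", "xpu")
assert torch.device("xpu:0").type in ("cpu", "xpu")
```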
diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py
index 96f4359bf..35bee393e 100644
--- a/bitsandbytes/nn/__init__.py
+++ b/bitsandbytes/nn/__init__.py
@@ -2,6 +2,7 @@
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
+from ..backends import backends
from .modules import (
Embedding,
Int8Params,
@@ -14,9 +15,12 @@
StableEmbedding,
SwitchBackLinearBnb,
)
-from .triton_based_modules import (
- StandardLinear,
- SwitchBackLinear,
- SwitchBackLinearGlobal,
- SwitchBackLinearVectorwise,
-)
+
+# The CPU and XPU backends do not need triton, and XPU does not support triton for now.
+if "xpu" not in backends.keys() and len(backends.keys()) > 1:
+ from .triton_based_modules import (
+ StandardLinear,
+ SwitchBackLinear,
+ SwitchBackLinearGlobal,
+ SwitchBackLinearVectorwise,
+ )
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
old mode 100644
new mode 100755
index 32854413f..2159c21e4
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -314,6 +314,9 @@ def cuda(self, device: Optional[Union[int, device, str]] = None, non_blocking: b
def cpu(self, non_blocking: bool = False):
return self.to(device="cpu", non_blocking=non_blocking)
+ def xpu(self, non_blocking: bool = False):
+ return self.to(device="xpu", non_blocking=non_blocking)
+
@overload
def to(
self: T,
@@ -331,7 +334,7 @@ def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ...
def to(self, *args, **kwargs):
device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
- if device is not None and device.type in ["cuda", "cpu"] and not self.bnb_quantized:
+ if device is not None and device.type in ["cuda", "cpu", "xpu"] and not self.bnb_quantized:
return self._quantize(device)
else:
if self.quant_state is not None:
@@ -417,6 +420,7 @@ def __init__(
# self.persistent_buffers = [] # TODO consider as way to save quant state
self.compute_dtype = compute_dtype
self.compute_type_is_set = False
+ self.ipex_linear_is_set = False
self.quant_state = None
self.quant_storage = quant_storage
@@ -445,35 +449,39 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
save weight and bias,
then fill state_dict with components of quant_state
"""
- if (
- getattr(self.weight, "quant_state", None) is not None
- and getattr(self.weight.quant_state, "op_context", None) is not None
- ):
- context = self.weight.quant_state.op_context
- self.weight.data = context.to_public(context.get_weight()).reshape([1, -1])
+ if getattr(self.weight, "quant_state", None) is not None and getattr(self.weight.quant_state, "ipex", False):
+ if self.weight.device.type == "cpu":
+ original_weight = torch.ops.ipex_prepack.woq_linear_unpack_weight(
+ self.weight, "nf4", self.weight.quant_state.shape, 2
+ )
+ self.weight.data = original_weight.data
+ elif self.weight.device.type == "xpu":
+ self.weight.data = self.weight.data.reshape(1, -1)
+
+ self.weight.quant_state.ipex = False
super()._save_to_state_dict(destination, prefix, keep_vars) # saving weight and bias
if getattr(self.weight, "quant_state", None) is not None:
- if (
- self.weight.quant_state.absmax.shape.numel() == 0
- and getattr(self.weight.quant_state, "op_context", None) is not None
- ):
- self.weight.quant_state.absmax = context.get_scales().reshape(-1)
- delattr(self.weight.quant_state, "op_context")
for k, v in self.weight.quant_state.as_dict(packed=True).items():
destination[prefix + "weight." + k] = v if keep_vars else v.detach()
- def forward(self, x: torch.Tensor):
- # Check if ipex fusion can be used
+ def set_ipex_linear(self, x: torch.Tensor):
if (
- x.device.type == "cpu"
- and not hasattr(self.weight.quant_state, "op_context")
+ (x.device.type in ("cpu", "xpu"))
+ and not getattr(self.weight.quant_state, "ipex", False)
and self.weight.quant_state.shape[1] % self.weight.quant_state.blocksize == 0
and self.weight.quant_state.quant_type == "nf4"
+ and not self.training
and x.requires_grad == False
):
- enable_ipex_fusion(self.weight, self.weight.quant_state)
+ enable_ipex_fusion(self)
+
+ def forward(self, x: torch.Tensor):
+ # Check if ipex fusion can be used
+ if not self.ipex_linear_is_set:
+ self.set_ipex_linear(x)
+ self.ipex_linear_is_set = True
# weights are cast automatically as Int8Params, but the bias has to be cast manually
if self.bias is not None and self.bias.dtype != x.dtype:
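
The `ipex_linear_is_set` flag makes the fusion check a one-time cost: the first forward decides whether the weight can be repacked, and every later call goes straight to the matmul. A minimal sketch of that lazy-initialization pattern, with `try_enable_fusion` as a hypothetical stand-in for `enable_ipex_fusion`:

```python
import torch

class LazyFusedLinear(torch.nn.Module):
    """Sketch of the one-shot fusion setup used by Linear4bit.forward above."""

    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.randn(out_features, in_features))
        self._fusion_checked = False  # plays the role of ipex_linear_is_set
        self.fused = False

    def try_enable_fusion(self, x: torch.Tensor) -> None:
        # Hypothetical stand-in: only fuse for inference-style inputs.
        self.fused = x.device.type in ("cpu", "xpu") and not x.requires_grad and not self.training

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if not self._fusion_checked:  # runs exactly once
            self.try_enable_fusion(x)
            self._fusion_checked = True
        return torch.nn.functional.linear(x, self.weight)

layer = LazyFusedLinear(8, 4).eval()
print(layer(torch.randn(2, 8)).shape, layer.fused)
```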
@@ -633,7 +641,20 @@ def __deepcopy__(self, memo):
def cpu(self):
# we store the 8-bit rows-major weight
- B = self.data.contiguous().bfloat16().cpu()
+ B = self.data.contiguous().to(torch.bfloat16).cpu()
+ CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
+ if CBt is not None:
+ del CBt
+ if SCBt is not None:
+ del SCBt
+ self.data = CB
+ self.CB = CB
+ self.SCB = SCB
+ return self
+
+ def xpu(self):
+ # we store the 8-bit rows-major weight
+ B = self.data.contiguous().to(torch.float16).xpu()
CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
if CBt is not None:
del CBt
@@ -669,6 +690,13 @@ def to(self, *args, **kwargs):
return self
else:
return self.cpu()
+ elif device.type == "xpu":
+ if self.data.dtype == torch.int8:
+ self.data = self.data.contiguous().xpu()
+ self.CB = self.data
+ return self
+ else:
+ return self.xpu()
else:
new_param = Int8Params(
super().to(device=device, dtype=dtype, non_blocking=non_blocking),
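
`Int8Params.cpu()` and the new `.xpu()` both quantize on the move: the floating-point weight is cast (bf16 on CPU, fp16 on XPU), run through `double_quant`, and only the row-major int8 matrix `CB` plus its scales `SCB` are kept. A rough sketch of that flow in plain PyTorch, with a simple per-row absmax quantizer standing in for `bnb.functional.double_quant`:

```python
import torch

def rowwise_int8_quant(B: torch.Tensor):
    """Toy stand-in for double_quant: per-row absmax int8 quantization."""
    scb = B.abs().amax(dim=1, keepdim=True).clamp(min=1e-8)  # per-row scales
    cb = torch.round(B / scb * 127).to(torch.int8)           # int8 weight
    return cb, scb.squeeze(1)

def move_int8_weight(weight: torch.Tensor, device: str):
    """Mirror of Int8Params.cpu()/.xpu(): cast, quantize, keep CB/SCB only."""
    cast_dtype = torch.bfloat16 if device == "cpu" else torch.float16
    B = weight.contiguous().to(cast_dtype).to(device)
    CB, SCB = rowwise_int8_quant(B.float())
    return CB, SCB

CB, SCB = move_int8_weight(torch.randn(16, 32), "cpu")
print(CB.dtype, CB.shape, SCB.shape)  # torch.int8, (16, 32), (16,)
```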
diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py
index 9e52c915d..adb36279c 100644
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
@@ -200,28 +200,39 @@ def unpack_tensor_to_dict(tensor_data):
return unpacked_dict
-def enable_ipex_fusion(weight, quant_state):
- from bitsandbytes.backends.cpu_xpu_common import _ipex_cpu_version_prereq
-
- if _ipex_cpu_version_prereq(2, 3):
- import intel_extension_for_pytorch as ipex
-
- lowp_mode = ipex.quantization.WoqLowpMode.BF16
- quant_state.op_context = torch.ops.ipex_prepack.weight_only_qlinear_prepack(
- weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2]),
- ipex.quantization.WoqWeightDtype.NF4,
+def enable_ipex_fusion(linear):
+ from bitsandbytes.backends.cpu_xpu_common import (
+ _ipex_cpu_version_prereq,
+ _ipex_xpu_version_prereq,
+ ipex_cpu_only,
+ ipex_xpu,
+ )
+
+ if ipex_cpu_only and _ipex_cpu_version_prereq(2, 5):
+ quant_state = linear.weight.quant_state
+ new_weight, new_scales, new_zeros, _, compensation = torch.ops.ipex_prepack.woq_linear_pack_weight(
+ linear.weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2]),
+ "nf4",
quant_state.shape, # weight shape
quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize), # scales
None, # zero_points
None, # bias
- None, # g_idx
None, # batch_size
quant_state.blocksize,
- int(lowp_mode),
- -1, # act_quant_mode. -1 means don't quant activation
+ 2,
)
- quant_state.absmax = torch.Tensor()
- weight.data = torch.empty([1, 0], dtype=torch.uint8)
+ elif ipex_xpu and _ipex_xpu_version_prereq(2, 5):
+ quant_state = linear.weight.quant_state
+ new_weight = linear.weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2])
+
+ new_scales = quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize)
+ new_zeros = None
+ compensation = None
+ linear.weight.data = new_weight.data
+ linear.weight.quant_state.ipex = True
+ linear.weight.quant_state.new_scales = new_scales
+ linear.weight.quant_state.new_zeros = new_zeros
+ linear.weight.quant_state.compensation = compensation
class QuantState:
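
After `enable_ipex_fusion` runs, the module's `quant_state` carries everything the fused kernel needs: the `ipex` flag, `new_scales`, `new_zeros` and, on CPU, `compensation`, while `weight.data` holds the repacked (CPU) or reshaped (XPU) buffer. A schematic of the resulting state, with dummy tensors standing in for the real prepack outputs:

```python
from types import SimpleNamespace

import torch

def sketch_ipex_state(shape=(64, 128), blocksize=64, device_type="xpu"):
    """Build a stand-in quant_state shaped like the one enable_ipex_fusion leaves behind."""
    rows, cols = shape
    state = SimpleNamespace(
        shape=torch.Size(shape),
        blocksize=blocksize,
        ipex=True,                                    # flag checked by gemm_4bit_impl
        new_scales=torch.ones(rows, cols // blocksize),
        new_zeros=None,
        compensation=torch.zeros(rows) if device_type == "cpu" else None,
    )
    packed_weight = torch.zeros(rows, cols // 2, dtype=torch.uint8)  # two nibbles per byte
    return packed_weight, state

w, st = sketch_ipex_state()
print(w.shape, st.ipex, st.new_scales.shape)
```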
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index d1acb2cd6..615dfd95e 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -208,8 +208,8 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-11.7
|-------------|------------------------|---------------------------|-------------------------|------------|
| **AMD ROCm** | 6.1+ | 3.10+ | minimum CDNA - `gfx90a`, RDNA - `gfx1100` | Alpha |
| **Apple Silicon (MPS)** | WIP | 3.10+ | M1/M2 chips | Planned |
-| **Intel CPU** | v2.4.0+ (`ipex`) | 3.10+ | Intel CPU | Alpha |
-| **Intel GPU** | v2.4.0+ (`ipex`) | 3.10+ | Intel GPU | Experimental |
+| **Intel CPU** | v2.5.0+ (`ipex`) | 3.10+ | Intel CPU | Alpha |
+| **Intel GPU** | v2.5.0+ (`ipex`) | 3.10+ | Intel GPU | Experimental |
For each supported backend, follow the respective instructions below:
@@ -336,8 +336,6 @@ The below commands are for Linux. For installing on Windows, please adapt the be
git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
pip install intel_extension_for_pytorch
pip install -r requirements-dev.txt
-cmake -DCOMPUTE_BACKEND=cpu -S .
-make
pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out)
```
diff --git a/docs/source/non_cuda_backends.mdx b/docs/source/non_cuda_backends.mdx
index 728606b7b..4c429fb2d 100644
--- a/docs/source/non_cuda_backends.mdx
+++ b/docs/source/non_cuda_backends.mdx
@@ -33,12 +33,12 @@ The following performance data is collected from Intel 4th Gen Xeon (SPR) platfo
| Data Type | BF16 | INT8 | NF4 | FP4 |
|---|---|---|---|---|
-| Speed-Up (vs BF16) | 1.0x | 0.6x | 2.3x | 0.03x |
+| Speed-Up (vs BF16) | 1.0x | 0.44x | 1.8x | 0.1x |
| Memory (GB) | 13.1 | 7.6 | 5.0 | 4.6 |
#### Fine-Tuning (CPU)
-| Data Type | AMP BF16 | INT8 | NF4 | FP4 |
+| Data Type | BF16 | INT8 | NF4 | FP4 |
|---|---|---|---|---|
-| Speed-Up (vs AMP BF16) | 1.0x | 0.38x | 0.07x | 0.07x |
+| Speed-Up (vs BF16) | 1.0x | 0.38x | 0.1x | 0.1x |
| Memory (GB) | 40 | 9 | 6.6 | 6.6 |
From 931569217fba9423dc176cf2956b96c625a96d3c Mon Sep 17 00:00:00 2001
From: jiqing-feng
Date: Mon, 2 Dec 2024 16:55:21 +0800
Subject: [PATCH 19/29] fix cpu nf4 (#1432)
Signed-off-by: jiqing-feng
---
bitsandbytes/autograd/_functions.py | 3 ++-
bitsandbytes/nn/modules.py | 3 ++-
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 9765def05..e188479f6 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -579,7 +579,8 @@ def matmul_4bit(
assert quant_state is not None
if A.device.type in ("cpu", "xpu") and A.requires_grad == False:
if getattr(quant_state, "ipex", False):
- out = F.gemv_4bit(A, B.t(), out, state=quant_state)
+ B = B.t() if len(B.shape) == 2 else B
+ out = F.gemv_4bit(A, B, out, state=quant_state)
if bias is not None:
out += bias
return out
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 2159c21e4..66f14edf7 100755
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -508,7 +508,8 @@ def forward(self, x: torch.Tensor):
x = x.to(self.compute_dtype)
bias = None if self.bias is None else self.bias.to(self.compute_dtype)
- out = bnb.matmul_4bit(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state)
+ weight = self.weight.t() if len(self.weight.shape) == 2 else self.weight
+ out = bnb.matmul_4bit(x, weight, bias=bias, quant_state=self.weight.quant_state)
out = out.to(inp_dtype)
From 994833378a51a96db6a74ee8071654def47007b2 Mon Sep 17 00:00:00 2001
From: Huazhong Ji
Date: Fri, 6 Dec 2024 22:45:55 +0800
Subject: [PATCH 20/29] Add Ascend NPU support for nf4 quant (#1422)
* Add npu support for nf4 quant
Co-authored-by: Slightwind
Co-authored-by: Ginray
* code format
* update
* pass lint check and fix typos
* add npu to supported devices
---------
Co-authored-by: Slightwind
Co-authored-by: Ginray
---
CMakeLists.txt | 49 +++++-
_typos.toml | 3 +
bitsandbytes/__init__.py | 1 +
bitsandbytes/autograd/_functions.py | 14 +-
bitsandbytes/backends/cpu_xpu_common.py | 2 +-
bitsandbytes/backends/npu.py | 152 ++++++++++++++--
bitsandbytes/cextension.py | 5 +
bitsandbytes/nn/modules.py | 10 +-
bitsandbytes/npu_specs.py | 20 +++
csrc/npu_kernels.cpp | 222 ++++++++++++++++++++++++
csrc/npu_ops.cpp | 51 ++++++
csrc/npu_ops.h | 28 +++
csrc/pythonInterface.cpp | 11 ++
docs/source/installation.mdx | 33 ++++
14 files changed, 581 insertions(+), 20 deletions(-)
create mode 100644 bitsandbytes/npu_specs.py
create mode 100644 csrc/npu_kernels.cpp
create mode 100644 csrc/npu_ops.cpp
create mode 100644 csrc/npu_ops.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 315e0ff1b..20dd2b45d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,7 @@
# For GCC: `cmake -B build . && cmake --build build`
# For MSVC: `cmake -B build . && cmake --build build --config Release`
# You can also use the following options and variables
-# - COMPUTE_BACKEND: Set to `cpu`, `cuda`, `hip` or `mps` to select the backend
+# - COMPUTE_BACKEND: Set to `cpu`, `cuda`, `hip`, `mps` or `npu` to select the backend
# - NO_CUBLASLT: Default OFF, will skip building/linking CUBLASLT support
# - CUDA_VERSION: The expected CUDA version, for sanity checking. The actual version
# is whatever CMake finds on your path.
@@ -29,11 +29,12 @@ set(CUDA_FILES csrc/ops.cu csrc/kernels.cu)
set(HIP_FILES csrc/ops.hip csrc/kernels.hip)
set(MPS_FILES csrc/mps_ops.mm)
set(METAL_FILES csrc/mps_kernels.metal)
+set(NPU_FILES csrc/npu_ops.cpp)
# C++ sources are always included
list(APPEND SRC_FILES ${CPP_FILES})
-set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps)")
-set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps)
+set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps, npu)")
+set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps npu)
option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)
if(APPLE)
@@ -69,6 +70,11 @@ elseif(${COMPUTE_BACKEND} STREQUAL "mps")
set(BUILD_CUDA OFF)
set(BUILD_HIP OFF)
set(BUILD_MPS ON)
+elseif(${COMPUTE_BACKEND} STREQUAL "npu")
+ set(BUILD_CUDA OFF)
+ set(BUILD_HIP OFF)
+ set(BUILD_MPS OFF)
+ set(BUILD_NPU ON)
else()
set(BUILD_CUDA OFF)
set(BUILD_HIP OFF)
@@ -232,6 +238,33 @@ elseif(BUILD_MPS)
COMMENT "Compiling Metal kernels"
VERBATIM)
add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib")
+elseif(BUILD_NPU)
+ list(APPEND SRC_FILES ${NPU_FILES})
+
+ set(SOC_VERSION "Ascend910B4" CACHE STRING "system on chip type")
+ set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_HOME_PATH} CACHE
+ STRING "ASCEND CAN package installation directory"
+ )
+
+    # ${KERNEL_FILES} is used to compile the library; add files written in AscendC to ${KERNEL_FILES}.
+    # See ascendc_library in cmake/npu.cmake and add_library in cmake/cpu.cmake for reference.
+ # file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/csrc/npu_kernels.cpp)
+ file(GLOB KERNEL_FILES csrc/npu_kernels.cpp)
+
+ if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+ set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+ elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
+ set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
+ else()
+ message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the can package is installed")
+ endif()
+ include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
+
+ # ascendc_library use to add kernel file to generate ascendc library
+ ascendc_library(ascendc_kernels_npu STATIC ${KERNEL_FILES})
+
+ string(APPEND BNB_OUTPUT_NAME "_npu")
+ add_compile_definitions(BUILD_NPU)
else()
string(APPEND BNB_OUTPUT_NAME "_cpu")
set(GPU_SOURCES)
@@ -249,7 +282,11 @@ endif()
set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)
add_library(bitsandbytes SHARED ${SRC_FILES})
-target_compile_features(bitsandbytes PUBLIC cxx_std_14)
+if(BUILD_NPU)
+ target_compile_features(bitsandbytes PUBLIC cxx_std_17)
+else()
+ target_compile_features(bitsandbytes PUBLIC cxx_std_14)
+endif()
target_include_directories(bitsandbytes PUBLIC csrc include)
@@ -306,6 +343,10 @@ if(BUILD_MPS)
add_dependencies(bitsandbytes metallib)
target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph")
endif()
+if(BUILD_NPU)
+ target_compile_options(bitsandbytes PRIVATE -O2 -std=c++17)
+ target_link_libraries(bitsandbytes PRIVATE $ ascendc_kernels_npu)
+endif()
if(WIN32)
set_target_properties(bitsandbytes PROPERTIES PREFIX "lib")
diff --git a/_typos.toml b/_typos.toml
index e4e7287fb..ff4c9ae06 100644
--- a/_typos.toml
+++ b/_typos.toml
@@ -3,12 +3,15 @@
[default]
extend-ignore-re = [
"@Ther-nul", # valid Github user
+ "CANN", # CANN (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for Ascend NPU
]
[default.extend-identifiers]
[type.py.extend-words]
"BA" = "BA" # used as a commented-out variable in tests
+"cann" = "cann" # cann (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for Ascend NPU
+
[type.cuda.extend-words]
"subtile" = "subtile"
diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index c705137c0..f850140a1 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -25,6 +25,7 @@
features = {"multi_backend"}
supported_torch_devices = {
"cuda", # includes ROCm
+ "npu", # Ascend NPU
"xpu", # Intel GPU
"cpu",
}
diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index e188479f6..6440ab1b5 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -519,7 +519,12 @@ def forward(ctx, A, B, out=None, bias=None, quant_state: Optional[F.QuantState]
# 1. Dequantize
# 2. MatmulnN
- output = torch.nn.functional.linear(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t(), bias)
+ if A.device.type == "npu":
+ output = torch.matmul(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t())
+ if bias is not None:
+ output += bias
+ else:
+ output = torch.nn.functional.linear(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t(), bias)
# 3. Save state
ctx.state = quant_state
@@ -550,7 +555,10 @@ def backward(ctx, grad_output):
# not supported by PyTorch. TODO: create work-around
# if req_gradB: grad_B = torch.matmul(grad_output.t(), A)
if req_gradA:
- grad_A = torch.matmul(grad_output, F.dequantize_4bit(B, ctx.state).to(grad_output.dtype).t())
+ if grad_output.device.type == "npu":
+ grad_A = torch.matmul(grad_output, F.dequantize_4bit(B, ctx.state).to(grad_output.dtype))
+ else:
+ grad_A = torch.matmul(grad_output, F.dequantize_4bit(B, ctx.state).to(grad_output.dtype).t())
return grad_A, grad_B, None, grad_bias, None
@@ -586,7 +594,7 @@ def matmul_4bit(
return out
else:
return MatMul4Bit.apply(A, B, out, bias, quant_state)
- elif A.numel() == A.shape[-1] and A.requires_grad == False:
+ elif A.numel() == A.shape[-1] and A.requires_grad == False and A.device.type != "npu":
if A.shape[-1] % quant_state.blocksize != 0:
warn(
f"Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}",
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index d2e0c2593..8fdf7569d 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -23,7 +23,7 @@
gxx_available = False
try:
- subprocess.run(["g++", "--version"])
+ subprocess.run(["g++", "--version"], capture_output=True) # hide terminal output
gxx_available = True
except BaseException:
warnings.warn("g++ not found, torch.compile disabled for CPU/XPU.")
diff --git a/bitsandbytes/backends/npu.py b/bitsandbytes/backends/npu.py
index 1b3cb57d6..ecbc2f351 100644
--- a/bitsandbytes/backends/npu.py
+++ b/bitsandbytes/backends/npu.py
@@ -1,17 +1,32 @@
+import ctypes as ct
from typing import Literal, Optional, Tuple, Union
import torch
-from bitsandbytes.utils import QuantState
-
-from .base import Backend
-
try:
# to support Ascend NPU backend
import torch_npu # noqa: F401
except ImportError:
pass
+from bitsandbytes.cextension import lib
+from bitsandbytes.functional import (
+ get_4bit_type,
+ get_ptr,
+)
+from bitsandbytes.utils import QuantState
+
+from .base import Backend
+
+
+def assert_on_npu(tensors):
+ if not all(t.device.type == "npu" for t in tensors if t is not None):
+ raise TypeError(
+ "All input tensors to be on NPU, but found some tensors not be on NPU:\n"
+ f"{[(t.shape, t.device) if isinstance(t, torch.Tensor) else None for t in tensors]}"
+ )
+ return True
+
class NPUBackend(Backend):
def double_quant(
@@ -75,12 +90,62 @@ def quantize_4bit(
A: torch.Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
- blocksize=64,
+ blocksize: Optional[int] = None,
compress_statistics=False,
- quant_type: Literal["fp4", "nf4"] = "fp4",
+ quant_type: Literal["fp4", "nf4"] = "nf4",
quant_storage=torch.uint8,
) -> Tuple[torch.Tensor, QuantState]:
- raise NotImplementedError
+ if quant_type not in ["nf4"]:
+ raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented.")
+ if compress_statistics:
+ raise NotImplementedError("compress_statistics is not implemented.")
+ if blocksize is None:
+ blocksize = 128
+
+ prev_device = torch.npu.current_device()
+ torch.npu.set_device(A.device)
+ if A.dtype in [torch.float32, torch.float16, torch.bfloat16]:
+ data = [
+ -1.0,
+ -0.6961928009986877,
+ -0.5250730514526367,
+ -0.39491748809814453,
+ -0.28444138169288635,
+ -0.18477343022823334,
+ -0.09105003625154495,
+ 0.0,
+ 0.07958029955625534,
+ 0.16093020141124725,
+ 0.24611230194568634,
+ 0.33791524171829224,
+ 0.44070982933044434,
+ 0.5626170039176941,
+ 0.7229568362236023,
+ 1.0,
+ ]
+ data = torch.tensor(data, device="npu", dtype=torch.float32).view(1, -1)
+ absmax = A.view(-1, blocksize).abs().max(dim=1, keepdim=True).values
+ a = A.view(-1, blocksize) / absmax.float()
+ diff = torch.abs(a.unsqueeze(-1) - data)
+ out = (torch.argmin(diff, dim=-1) + 8) % 16
+ out = out.reshape(-1, 2)
+ out = (out[:, 0] + out[:, 1] * 16).to(torch.uint8)
+ else:
+ raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
+ assert_on_npu([A, absmax, out])
+ torch.npu.set_device(prev_device)
+
+ code = get_4bit_type(quant_type, device=A.device)
+ state = QuantState(
+ absmax=absmax,
+ shape=A.shape,
+ dtype=A.dtype,
+ blocksize=blocksize,
+ code=code,
+ quant_type=quant_type,
+ )
+
+ return out, state
def dequantize_4bit(
self,
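
The NPU `quantize_4bit` above is a pure-PyTorch nearest-neighbour quantizer: per-block absmax scaling, argmin against the NF4 grid, a `(+8) % 16` shuffle so the packed layout matches the dequant kernel, and two indices packed per byte. An equivalent standalone sketch of the same math, independent of `torch_npu`:

```python
import torch

NF4_GRID = torch.tensor([
    -1.0, -0.6961928009986877, -0.5250730514526367, -0.39491748809814453,
    -0.28444138169288635, -0.18477343022823334, -0.09105003625154495, 0.0,
    0.07958029955625534, 0.16093020141124725, 0.24611230194568634,
    0.33791524171829224, 0.44070982933044434, 0.5626170039176941,
    0.7229568362236023, 1.0,
])

def nf4_quantize_reference(A: torch.Tensor, blocksize: int = 128):
    """Nearest-code NF4 quantization with two 4-bit indices packed per byte."""
    absmax = A.view(-1, blocksize).abs().max(dim=1, keepdim=True).values
    a = A.view(-1, blocksize) / absmax.float()
    idx = torch.argmin(torch.abs(a.unsqueeze(-1) - NF4_GRID.view(1, 1, -1)), dim=-1)
    idx = (idx + 8) % 16                  # layout expected by the dequant kernel
    idx = idx.reshape(-1, 2)
    packed = (idx[:, 0] + idx[:, 1] * 16).to(torch.uint8)
    return packed, absmax

packed, absmax = nf4_quantize_reference(torch.randn(2, 128))
print(packed.shape, absmax.shape)  # 128 packed bytes, 2 block scales
```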
@@ -88,10 +153,77 @@ def dequantize_4bit(
quant_state: Optional[QuantState] = None,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
- blocksize: int = 64,
- quant_type: Literal["fp4", "nf4"] = "fp4",
+ blocksize: Optional[int] = None,
+ quant_type: Literal["fp4", "nf4"] = "nf4",
) -> torch.Tensor:
- raise NotImplementedError
+ if blocksize is None:
+ blocksize = 128
+ supported_blocksizes = [2048, 4096, 1024, 512, 256, 128, 64]
+ if blocksize not in supported_blocksizes:
+ raise ValueError(
+ f"The blockwise of {blocksize} is not supported. Supported values: {supported_blocksizes}"
+ )
+
+ if quant_state is None:
+ assert absmax is not None and out is not None
+ quant_state = QuantState(
+ absmax=absmax, shape=out.shape, dtype=out.dtype, blocksize=blocksize, quant_type=quant_type
+ )
+ else:
+ absmax = quant_state.absmax
+
+ if out is None:
+ out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device)
+
+ n = out.numel()
+
+ prev_device = torch.npu.current_device()
+ torch.npu.set_device(A.device)
+ assert_on_npu([A, absmax, out])
+
+ if quant_state.quant_type not in ["nf4"]:
+ raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented.")
+
+ if out.dtype == torch.float32:
+ lib.cdequantize_blockwise_fp32_nf4(
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int(quant_state.blocksize),
+ ct.c_int(n),
+ torch.npu.current_stream(),
+ )
+ elif out.dtype == torch.float16:
+ lib.cdequantize_blockwise_fp16_nf4(
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int(quant_state.blocksize),
+ ct.c_int(n),
+ torch.npu.current_stream(),
+ )
+ elif out.dtype == torch.bfloat16:
+ # bf16: bf16 -> fp32 -> op -> fp32 -> bf16
+ absmax = absmax.to(torch.float32)
+ out = out.to(torch.float32)
+ lib.cdequantize_blockwise_fp32_nf4(
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int(quant_state.blocksize),
+ ct.c_int(n),
+ torch.npu.current_stream(),
+ )
+ out = out.to(torch.bfloat16)
+ else:
+ raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
+ torch.npu.set_device(prev_device)
+ is_transposed = True if A.shape[0] == 1 else False
+
+ if is_transposed:
+ return out.t()
+ else:
+ return out
def gemv_4bit(
self,
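
Because only fp32 and fp16 kernel variants exist, bf16 outputs take a round trip through fp32, as the hunk above shows. The same pattern as a behavioural sketch in plain PyTorch; `dequant_fp32` is any callable with the fp32 kernel's in-place semantics, not the real binding:

```python
import torch

def dequant_bf16_via_fp32(dequant_fp32, A, absmax, out_bf16):
    """Run an fp32-only dequant kernel for a bf16 destination.

    `dequant_fp32` is assumed to fill a preallocated fp32 output in place.
    """
    absmax32 = absmax.to(torch.float32)
    out32 = out_bf16.to(torch.float32)
    dequant_fp32(A, absmax32, out32)   # fp32 compute
    return out32.to(torch.bfloat16)    # cast back at the end

# toy usage with a fake "kernel" that just broadcasts the mean scale
fake_kernel = lambda A, s, out: out.copy_(s.mean().expand_as(out))
out = dequant_bf16_via_fp32(fake_kernel, torch.zeros(4, dtype=torch.uint8),
                            torch.tensor([0.5, 2.0]), torch.empty(8, dtype=torch.bfloat16))
print(out.dtype, out)
```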
diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index cc5d8deff..ec329cbb6 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -25,6 +25,7 @@
from bitsandbytes.consts import DYNAMIC_LIBRARY_SUFFIX, PACKAGE_DIR
from bitsandbytes.cuda_specs import CUDASpecs, get_cuda_specs, get_rocm_gpu_arch
+from bitsandbytes.npu_specs import get_npu_specs
logger = logging.getLogger(__name__)
@@ -100,6 +101,10 @@ def get_native_library() -> BNBNativeLibrary:
binary_path = cuda_binary_path
else:
logger.warning("Could not find the bitsandbytes %s binary at %r", BNB_BACKEND, cuda_binary_path)
+ npu_specs = get_npu_specs()
+ if npu_specs:
+ binary_path = PACKAGE_DIR / f"libbitsandbytes_npu{DYNAMIC_LIBRARY_SUFFIX}"
+
logger.debug(f"Loading bitsandbytes native library from: {binary_path}")
dll = ct.cdll.LoadLibrary(str(binary_path))
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 66f14edf7..781e22541 100755
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -314,6 +314,12 @@ def cuda(self, device: Optional[Union[int, device, str]] = None, non_blocking: b
def cpu(self, non_blocking: bool = False):
return self.to(device="cpu", non_blocking=non_blocking)
+ def npu(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
+ # `torch.Tensor.to()` is not supported by `torch_npu` (see this [issue](https://github.com/Ascend/pytorch/issues/16)).
+ if isinstance(device, int):
+ device = f"npu:{device}"
+ return self.to(device="npu" if device is None else device, non_blocking=non_blocking)
+
def xpu(self, non_blocking: bool = False):
return self.to(device="xpu", non_blocking=non_blocking)
@@ -334,7 +340,7 @@ def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ...
def to(self, *args, **kwargs):
device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
- if device is not None and device.type in ["cuda", "cpu", "xpu"] and not self.bnb_quantized:
+ if device is not None and device.type in ["cuda", "cpu", "npu", "xpu"] and not self.bnb_quantized:
return self._quantize(device)
else:
if self.quant_state is not None:
@@ -497,7 +503,7 @@ def forward(self, x: torch.Tensor):
self.weight.quant_state = self.quant_state
else:
print(
- "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first.",
+ "FP4 quantization state not initialized. Please call .cuda(), .npu() or .to(device) on the LinearFP4 layer first.",
)
if not self.compute_type_is_set:
self.set_compute_type(x)
diff --git a/bitsandbytes/npu_specs.py b/bitsandbytes/npu_specs.py
new file mode 100644
index 000000000..7c7cd707e
--- /dev/null
+++ b/bitsandbytes/npu_specs.py
@@ -0,0 +1,20 @@
+import dataclasses
+
+import torch
+
+try:
+ import torch_npu # noqa: F401
+except ImportError:
+ pass
+
+
+@dataclasses.dataclass(frozen=True)
+class NPUSpecs:
+ cann_version_string: str
+
+
+def get_npu_specs():
+ if hasattr(torch, "npu") and torch.npu.is_available():
+ return NPUSpecs(cann_version_string=torch.version.cann)
+ else:
+ return None
diff --git a/csrc/npu_kernels.cpp b/csrc/npu_kernels.cpp
new file mode 100644
index 000000000..c70e71681
--- /dev/null
+++ b/csrc/npu_kernels.cpp
@@ -0,0 +1,222 @@
+#include "kernel_operator.h"
+#include "npu_ops.h"
+
+using namespace AscendC;
+
+constexpr int32_t BUFFER_NUM = 1;
+
+constexpr half Q_COFF_0 = -0.377685546875;
+constexpr half Q_COFF_1 = -3.193359375;
+constexpr half Q_COFF_2 = 0.583984375;
+constexpr half Q_COFF_3 = 6.02734375;
+constexpr half Q_COFF_4 = 1.9560546875;
+constexpr half Q_COFF_5 = 7.08984375;
+
+#define CEIL32(num) (((num) + 32 - 1) / 32 * 32)
+#define CEIL_BASE(num, base) (((num) + (base) - 1) / (base) * (base))
+
+
+template <typename T, int32_t TypeMode>
+class KernelDequantizeBlockwiseNf4 {
+public:
+ __aicore__ inline KernelDequantizeBlockwiseNf4() {}
+
+ __aicore__ inline void Init(GM_ADDR A, GM_ADDR absmax, GM_ADDR out, GM_ADDR tilingDevice, TPipe &pipe)
+ {
+ ASSERT(GetBlockNum() != 0 && "block dim can not be zero!");
+ auto *tiling_data = reinterpret_cast<__gm__ BlockwiseNf4TilingData *>(tilingDevice);
+ this->blocksize = tiling_data->blocksize;
+ uint32_t coreNum = tiling_data->coreNum;
+ uint32_t singleCoreNumel = tiling_data->singleCoreNumel;
+ uint32_t singleCoreNumelTail = tiling_data->singleCoreNumelTail;
+ uint32_t numel = tiling_data->numel;
+ uint32_t ubSize = tiling_data->ubSize;
+ uint32_t blockIdx = (uint32_t)GetBlockIdx();
+ if (coreNum - blockIdx == 1) {
+ this->CurCoreFP16Num = singleCoreNumelTail;
+ } else {
+ this->CurCoreFP16Num = singleCoreNumel;
+ }
+ constexpr uint32_t ELEMENT_BYTES = (TypeMode == 1) ? 4 : 2; // FP32: 4bytes, FP16/BF16: 2bytes
+ uint32_t eachBatchPkgNum = (ubSize - 16 * ELEMENT_BYTES) /
+ (this->blocksize / 2 * BUFFER_NUM + ELEMENT_BYTES * BUFFER_NUM + this->blocksize *
+ (ELEMENT_BYTES * BUFFER_NUM + sizeof(half) + sizeof(uint32_t) + ELEMENT_BYTES));
+ if (eachBatchPkgNum >= 32 / ELEMENT_BYTES) {
+ eachBatchPkgNum = (eachBatchPkgNum / (32 / ELEMENT_BYTES)) * (32 / ELEMENT_BYTES);
+ } else {
+ eachBatchPkgNum = (eachBatchPkgNum / 2) * 2;
+ }
+ this->eachBatchFP16Num = this->blocksize * eachBatchPkgNum; // 64 * 288
+
+ // gm, 32-byte alignment
+ uint32_t AOffset = singleCoreNumel / 2 * blockIdx;
+ uint32_t ABufferSize = singleCoreNumel / 2;
+ AGm.SetGlobalBuffer((__gm__ int8_t*)A + AOffset, ABufferSize);
+ uint32_t absmaxOffset = singleCoreNumel / this->blocksize * blockIdx;
+ uint32_t absmaxBufferSize = singleCoreNumel / this->blocksize;
+ absmaxGm.SetGlobalBuffer((__gm__ T*)absmax + absmaxOffset, absmaxBufferSize);
+ uint32_t outOffset = singleCoreNumel * blockIdx;
+ uint32_t outBufferSize = singleCoreNumel;
+ outGm.SetGlobalBuffer((__gm__ T*)out + outOffset, outBufferSize);
+
+ // TQue, 32-byte alignment
+ pipe.InitBuffer(inQueueA, BUFFER_NUM, this->eachBatchFP16Num / 2);
+ pipe.InitBuffer(inQueueAbsmax, BUFFER_NUM, CEIL32(eachBatchPkgNum * ELEMENT_BYTES));
+ pipe.InitBuffer(outQueueOut, BUFFER_NUM, this->eachBatchFP16Num * ELEMENT_BYTES);
+
+ // TBuf, 32-byte alignment
+ pipe.InitBuffer(calcNf4ToFloat, 16 * ELEMENT_BYTES);
+ pipe.InitBuffer(calcAFP16, this->eachBatchFP16Num * sizeof(half));
+ pipe.InitBuffer(calcAUint32, this->eachBatchFP16Num * sizeof(uint32_t));
+ pipe.InitBuffer(calcAbsmaxBuf, this->eachBatchFP16Num * ELEMENT_BYTES);
+ }
+
+ __aicore__ inline void Process(void)
+ {
+ Compute();
+ }
+
+private:
+    __aicore__ inline void initNf4ToFloat(LocalTensor<T> &nf4ToFloat)
+ {
+ if constexpr (TypeMode == 1) {
+            nf4ToFloat(0) = static_cast<T>(-1.0);
+            nf4ToFloat(1) = static_cast<T>(-0.6961928009986877);
+            nf4ToFloat(2) = static_cast<T>(-0.5250730514526367);
+            nf4ToFloat(3) = static_cast<T>(-0.39491748809814453);
+            nf4ToFloat(4) = static_cast<T>(-0.28444138169288635);
+            nf4ToFloat(5) = static_cast<T>(-0.18477343022823334);
+            nf4ToFloat(6) = static_cast<T>(-0.09105003625154495);
+            nf4ToFloat(7) = static_cast<T>(0.0);
+            nf4ToFloat(8) = static_cast<T>(0.07958029955625534);
+            nf4ToFloat(9) = static_cast<T>(0.16093020141124725);
+            nf4ToFloat(10) = static_cast<T>(0.24611230194568634);
+            nf4ToFloat(11) = static_cast<T>(0.33791524171829224);
+            nf4ToFloat(12) = static_cast<T>(0.44070982933044434);
+            nf4ToFloat(13) = static_cast<T>(0.5626170039176941);
+            nf4ToFloat(14) = static_cast<T>(0.7229568362236023);
+            nf4ToFloat(15) = static_cast<T>(1.0);
+ } else if constexpr (TypeMode == 2) {
+            nf4ToFloat(0) = static_cast<T>(-1.0);
+            nf4ToFloat(1) = static_cast<T>(-0.6962890625);
+            nf4ToFloat(2) = static_cast<T>(-0.52490234375);
+            nf4ToFloat(3) = static_cast<T>(-0.39501953125);
+            nf4ToFloat(4) = static_cast<T>(-0.284423828125);
+            nf4ToFloat(5) = static_cast<T>(-0.184814453125);
+            nf4ToFloat(6) = static_cast<T>(-0.091064453125);
+            nf4ToFloat(7) = static_cast<T>(0.0);
+            nf4ToFloat(8) = static_cast<T>(0.07958984375);
+            nf4ToFloat(9) = static_cast<T>(0.160888671875);
+            nf4ToFloat(10) = static_cast<T>(0.24609375);
+            nf4ToFloat(11) = static_cast<T>(0.337890625);
+            nf4ToFloat(12) = static_cast<T>(0.440673828125);
+            nf4ToFloat(13) = static_cast<T>(0.5625);
+            nf4ToFloat(14) = static_cast<T>(0.72314453125);
+            nf4ToFloat(15) = static_cast<T>(1.0);
+ }
+ }
+
+ __aicore__ inline void Compute(void)
+ {
+ constexpr uint32_t ELEMENT_BYTES = (TypeMode == 1) ? 4 : 2; // FP32: 4bytes, FP16/BF16: 2bytes
+ LocalTensor ALocal = inQueueA.AllocTensor();
+ LocalTensor absmaxLocal = inQueueAbsmax.AllocTensor();
+ LocalTensor outLocal = outQueueOut.AllocTensor();
+
+ LocalTensor AFP16 = calcAFP16.Get();
+ LocalTensor AInt32 = calcAUint32.Get();
+ LocalTensor absmaxBuf = calcAbsmaxBuf.Get();
+ LocalTensor nf4ToFloat = calcNf4ToFloat.Get();
+ initNf4ToFloat(nf4ToFloat);
+
+ DataCopyParams dataCopyParams = {1, 0, 0, 0};
+ uint32_t curBatchNumel = this->eachBatchFP16Num;
+ uint32_t curBatchPkgNum = curBatchNumel / this->blocksize;
+
+ uint32_t batchCount = (this->CurCoreFP16Num + this->eachBatchFP16Num - 1) / this->eachBatchFP16Num;
+ for (uint32_t batchIdx = 0; batchIdx < batchCount; batchIdx++) {
+ if (batchCount - batchIdx == 1) {
+ curBatchNumel = this->CurCoreFP16Num - this->eachBatchFP16Num * batchIdx;
+ curBatchPkgNum = (curBatchNumel + this->blocksize - 1) / this->blocksize;
+ }
+
+ dataCopyParams.blockLen = curBatchNumel / 2; // Byte
+ DataCopyPad(ALocal, AGm[this->eachBatchFP16Num / 2 * batchIdx], dataCopyParams, {true, 0, 0, 0});
+ dataCopyParams.blockLen = ELEMENT_BYTES * curBatchPkgNum; // Byte
+ uint32_t gmOffset = this->eachBatchFP16Num / this->blocksize * batchIdx;
+ DataCopyPad(absmaxLocal, absmaxGm[gmOffset], dataCopyParams, {true, 0, 0, 0});
+ set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+ wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+ pipe_barrier(PIPE_ALL);
+
+ LocalTensor AInt4 = ALocal.ReinterpretCast();
+ Cast(AFP16, AInt4, RoundMode::CAST_NONE, curBatchNumel);
+ pipe_barrier(PIPE_V);
+ Adds(AFP16, AFP16, static_cast(8), curBatchNumel);
+ pipe_barrier(PIPE_V);
+ if constexpr (TypeMode == 1) {
+ Muls(AFP16, AFP16, static_cast(4), curBatchNumel);
+ } else {
+ Muls(AFP16, AFP16, static_cast(2), curBatchNumel);
+ }
+ pipe_barrier(PIPE_V);
+ Cast(AInt32, AFP16, RoundMode::CAST_ROUND, curBatchNumel);
+ pipe_barrier(PIPE_V);
+ LocalTensor AUint32 = AInt32.ReinterpretCast();
+ Gather(outLocal, nf4ToFloat, AUint32, 0, curBatchNumel);
+ pipe_barrier(PIPE_V);
+ uint32_t dstShape[] = {curBatchPkgNum, this->blocksize};
+ uint32_t srcShape[] = {curBatchPkgNum, 1};
+ BroadCast(absmaxBuf, absmaxLocal, dstShape, srcShape);
+ pipe_barrier(PIPE_ALL);
+ Mul(outLocal, outLocal, absmaxBuf, curBatchNumel);
+ pipe_barrier(PIPE_ALL);
+
+ dataCopyParams.blockLen = ELEMENT_BYTES * curBatchNumel; // Byte
+ DataCopyPad(outGm[batchIdx * this->eachBatchFP16Num], outLocal, dataCopyParams);
+ pipe_barrier(PIPE_MTE3);
+ }
+ pipe_barrier(PIPE_ALL);
+
+ inQueueA.FreeTensor(ALocal);
+ inQueueAbsmax.FreeTensor(absmaxLocal);
+ outQueueOut.FreeTensor(outLocal);
+ }
+
+private:
+ TQue inQueueA;
+ TQue inQueueAbsmax;
+ TQue outQueueOut;
+ TBuf calcAFP16;
+ TBuf calcAUint32;
+ TBuf calcNf4ToFloat;
+ TBuf calcAbsmaxBuf;
+    GlobalTensor<int8_t> AGm;
+    GlobalTensor<T> absmaxGm;
+    GlobalTensor<T> outGm;
+ uint32_t blocksize;
+ uint32_t CurCoreFP16Num;
+ uint32_t eachBatchFP16Num;
+};
+
+
+
+extern "C" {
+
+__global__ __aicore__ void dequantize_blockwise_fp32_nf4(GM_ADDR A, GM_ADDR absmax, GM_ADDR out, GM_ADDR tiling)
+{
+ TPipe pipe;
+    KernelDequantizeBlockwiseNf4<float, 1> op;
+ op.Init(A, absmax, out, tiling, pipe);
+ op.Process();
+}
+
+__global__ __aicore__ void dequantize_blockwise_fp16_nf4(GM_ADDR A, GM_ADDR absmax, GM_ADDR out, GM_ADDR tiling)
+{
+ TPipe pipe;
+    KernelDequantizeBlockwiseNf4<half, 2> op;
+ op.Init(A, absmax, out, tiling, pipe);
+ op.Process();
+}
+
+}
diff --git a/csrc/npu_ops.cpp b/csrc/npu_ops.cpp
new file mode 100644
index 000000000..fb5ecef2f
--- /dev/null
+++ b/csrc/npu_ops.cpp
@@ -0,0 +1,51 @@
+#include
+#include "acl/acl.h"
+#include "tiling/platform/platform_ascendc.h"
+#include "npu_ops.h"
+
+#include "aclrtlaunch_dequantize_blockwise_fp32_nf4.h"
+#include "aclrtlaunch_dequantize_blockwise_fp16_nf4.h"
+
+
+extern "C" {
+
+int32_t get_dequantize_blockwise_nf4_tiling(uint32_t blocksize, uint32_t n, BlockwiseNf4TilingData *tiling) {
+ tiling->ubSize = 196 * 1024;
+ uint32_t coreNum = 40;
+ uint32_t totalPkgNum = (n + blocksize - 1) / blocksize;
+ uint32_t singleCorePkgNum = (totalPkgNum + coreNum - 1) / coreNum;
+ coreNum = (totalPkgNum + singleCorePkgNum - 1) / singleCorePkgNum;
+ uint32_t singleCoreNumel = singleCorePkgNum * blocksize;
+ uint32_t singleCoreNumelTail = n % singleCoreNumel;
+ if (singleCoreNumelTail == 0) {
+ singleCoreNumelTail = singleCoreNumel;
+ }
+ tiling->coreNum = coreNum;
+ tiling->blocksize = blocksize;
+ tiling->numel = n;
+ tiling->singleCoreNumel = singleCoreNumel;
+ tiling->singleCoreNumelTail = singleCoreNumelTail;
+ return 0;
+}
+
+void dequantizeBlockwiseNf4(uint8_t *A, uint8_t *absmax, uint8_t *out, uint32_t blocksize, uint32_t n, void* stream, const uint32_t type_mode) {
+ uint32_t blockDim = 40;
+ size_t tilingSize = sizeof(struct BlockwiseNf4TilingData);
+ BlockwiseNf4TilingData *tilingHost;
+ tilingHost = (struct BlockwiseNf4TilingData *)malloc(tilingSize);
+ uint32_t error = get_dequantize_blockwise_nf4_tiling(blocksize, n, tilingHost);
+ if (error != 0) {
+ printf("[!] error\n");
+ }
+ uint8_t *tilingDevice = nullptr;
+ aclrtMalloc((void **)&tilingDevice, tilingSize, ACL_MEM_MALLOC_NORMAL_ONLY);
+ aclrtMemcpyAsync((void *)tilingDevice, tilingSize, tilingHost, tilingSize, ACL_MEMCPY_HOST_TO_DEVICE, stream);
+ if (type_mode == 1) {
+ ACLRT_LAUNCH_KERNEL(dequantize_blockwise_fp32_nf4)(blockDim, stream, A, absmax, out, tilingDevice);
+ } else if (type_mode == 2) {
+ ACLRT_LAUNCH_KERNEL(dequantize_blockwise_fp16_nf4)(blockDim, stream, A, absmax, out, tilingDevice);
+ }
+ aclrtFree(tilingDevice);
+}
+
+}
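
The tiling helper splits the `n` elements into `blocksize`-sized packages, spreads them over at most 40 cores, and records the tail so the last core knows how many real elements it owns. The same arithmetic in Python, handy for sanity-checking a tiling configuration:

```python
def nf4_tiling(blocksize: int, n: int, max_cores: int = 40) -> dict:
    """Python mirror of get_dequantize_blockwise_nf4_tiling above."""
    total_pkgs = (n + blocksize - 1) // blocksize
    per_core_pkgs = (total_pkgs + max_cores - 1) // max_cores
    core_num = (total_pkgs + per_core_pkgs - 1) // per_core_pkgs
    single_core = per_core_pkgs * blocksize
    tail = n % single_core or single_core  # element count owned by the last core
    return {
        "coreNum": core_num,
        "blocksize": blocksize,
        "numel": n,
        "singleCoreNumel": single_core,
        "singleCoreNumelTail": tail,
    }

print(nf4_tiling(blocksize=64, n=4096 * 4096))
```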
diff --git a/csrc/npu_ops.h b/csrc/npu_ops.h
new file mode 100644
index 000000000..d7a26cd34
--- /dev/null
+++ b/csrc/npu_ops.h
@@ -0,0 +1,28 @@
+#ifndef NPU_OPS_H
+#define NPU_OPS_H
+#include <iostream>
+
+#define CHECK_ACL(x) \
+ do { \
+ aclError __ret = x; \
+ if (__ret != ACL_ERROR_NONE) { \
+ std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \
+ } \
+ } while (0);
+
+
+struct BlockwiseNf4TilingData {
+ uint32_t coreNum;
+ uint32_t blocksize;
+ uint32_t numel;
+ uint32_t singleCoreNumel;
+ uint32_t singleCoreNumelTail;
+ uint32_t ubSize;
+};
+
+extern "C" {
+
+void dequantizeBlockwiseNf4(uint8_t *A, uint8_t *absmax, uint8_t *out, uint32_t blocksize, uint32_t n, void* stream, const uint32_t type_mode);
+
+}
+#endif
diff --git a/csrc/pythonInterface.cpp b/csrc/pythonInterface.cpp
index be6abc070..2d3031936 100644
--- a/csrc/pythonInterface.cpp
+++ b/csrc/pythonInterface.cpp
@@ -12,6 +12,9 @@
#if BUILD_MPS
// #include
#endif
+#if BUILD_NPU
+#include
+#endif
#include
// We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary.
@@ -601,6 +604,14 @@ extern "C"
#endif
+#if BUILD_NPU
+ void cdequantize_blockwise_fp32_nf4(uint8_t *A, uint8_t *absmax, uint8_t *out, uint32_t blocksize, uint32_t n, void* stream)
+ { dequantizeBlockwiseNf4(A, absmax, out, blocksize, n, stream, 1); }
+
+ void cdequantize_blockwise_fp16_nf4(uint8_t *A, uint8_t *absmax, uint8_t *out, uint32_t blocksize, uint32_t n, void* stream)
+ { dequantizeBlockwiseNf4(A, absmax, out, blocksize, n, stream, 2); }
+#endif
+
void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); }
void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n){ dequantize_cpu(code, A, absmax, out, blocksize, n); }
}
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 615dfd95e..79613856f 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -210,6 +210,7 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-11.7
| **Apple Silicon (MPS)** | WIP | 3.10+ | M1/M2 chips | Planned |
| **Intel CPU** | v2.5.0+ (`ipex`) | 3.10+ | Intel CPU | Alpha |
| **Intel GPU** | v2.5.0+ (`ipex`) | 3.10+ | Intel GPU | Experimental |
+| **Ascend NPU** | 2.1.0+ (`torch_npu`) | 3.10+ | Ascend NPU | Experimental |
For each supported backend, follow the respective instructions below:
@@ -251,6 +252,13 @@ Compatible hardware and functioning `import intel_extension_for_pytorch as ipex`
Please refer to [the official Intel installations instructions](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu&version=v2.4.0%2bcpu&os=linux%2fwsl2) for guidance on how to pip install the necessary `intel_extension_for_pytorch` dependency.
+
+
+
+Compatible hardware and functioning `import torch_npu` capable environment with Python `3.10` as the minimum requirement.
+
+Please refer to [the official Ascend installation instructions](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/configandinstg/instg/insg_0001.html) for guidance on how to pip install the necessary `torch_npu` dependency.
+
@@ -339,6 +347,31 @@ pip install -r requirements-dev.txt
pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out)
```
+
+
+
+#### Ascend NPU
+
+> [!TIP]
+> The Ascend NPU backend currently only supports building from source; please follow the instructions below.
+
+
+```
+# Install bitsandbytes from source
+# Clone the bitsandbytes repo; the Ascend NPU backend is currently enabled on the multi-backend-refactor branch
+git clone -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
+
+# Install dependencies
+pip install -r requirements-dev.txt
+
+# Compile & install
+apt-get install -y build-essential cmake # install build tool dependencies, if not already present
+cmake -DCOMPUTE_BACKEND=npu -S .
+make
+pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out)
+```
+
+
From 7e6f8657abd4b9547031585c0dc2af50a3160e80 Mon Sep 17 00:00:00 2001
From: jiqing-feng
Date: Wed, 18 Dec 2024 00:27:14 +0800
Subject: [PATCH 21/29] fix device check (#1453)
Signed-off-by: jiqing-feng
---
bitsandbytes/nn/modules.py | 2 +-
bitsandbytes/utils.py | 8 ++++----
setup.py | 4 ++--
3 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 781e22541..ad5a7d443 100755
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -481,7 +481,7 @@ def set_ipex_linear(self, x: torch.Tensor):
and not self.training
and x.requires_grad == False
):
- enable_ipex_fusion(self)
+ enable_ipex_fusion(self, x)
def forward(self, x: torch.Tensor):
# Check if ipex fusion can be used
diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py
index adb36279c..02c9ac2ca 100644
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
@@ -200,15 +200,15 @@ def unpack_tensor_to_dict(tensor_data):
return unpacked_dict
-def enable_ipex_fusion(linear):
+def enable_ipex_fusion(linear, x):
from bitsandbytes.backends.cpu_xpu_common import (
_ipex_cpu_version_prereq,
_ipex_xpu_version_prereq,
- ipex_cpu_only,
+ ipex_cpu,
ipex_xpu,
)
- if ipex_cpu_only and _ipex_cpu_version_prereq(2, 5):
+ if x.device.type == "cpu" and ipex_cpu and _ipex_cpu_version_prereq(2, 5):
quant_state = linear.weight.quant_state
new_weight, new_scales, new_zeros, _, compensation = torch.ops.ipex_prepack.woq_linear_pack_weight(
linear.weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2]),
@@ -221,7 +221,7 @@ def enable_ipex_fusion(linear):
quant_state.blocksize,
2,
)
- elif ipex_xpu and _ipex_xpu_version_prereq(2, 5):
+ elif x.device.type == "xpu" and ipex_xpu and _ipex_xpu_version_prereq(2, 5):
quant_state = linear.weight.quant_state
new_weight = linear.weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2])
diff --git a/setup.py b/setup.py
index e8d3f547c..4002ee268 100644
--- a/setup.py
+++ b/setup.py
@@ -27,8 +27,8 @@ def get_latest_semver_tag():
tags = subprocess.check_output(["git", "tag"], text=True).splitlines()
semver_tags = [tag for tag in tags if tag.count(".") == 2 and all(part.isdigit() for part in tag.split("."))]
if not semver_tags:
- print("No valid semantic version tags found, use 0.0.1 defaultly")
- semver_tags = ["0.0.1"]
+ print("No valid semantic version tags found, use 1.0.0 defaultly")
+ semver_tags = ["1.0.0"]
return sorted(semver_tags, key=lambda s: list(map(int, s.split("."))))[-1]
From f6025bcae1395b1f3b5a993c4ac3ddde1dfda699 Mon Sep 17 00:00:00 2001
From: jiqing-feng
Date: Thu, 23 Jan 2025 05:17:34 +0800
Subject: [PATCH 22/29] Enable double quant on Intel CPU and XPU (#1472)
* fix dequant 8bit
Signed-off-by: jiqing-feng
* support double quant on intel cpu and xpu
Signed-off-by: jiqing-feng
* fix format
Signed-off-by: jiqing-feng
* fix shape
Signed-off-by: jiqing-feng
* fix 4bit format
Signed-off-by: jiqing-feng
* fix device error for xpu
Signed-off-by: jiqing-feng
* fix 4bit tensor shape
Signed-off-by: jiqing-feng
* fix nf4 xpu finetune
Signed-off-by: jiqing-feng
---------
Signed-off-by: jiqing-feng
---
bitsandbytes/backends/cpu_xpu_common.py | 71 ++++++++++++++++++-------
bitsandbytes/backends/xpu.py | 2 +-
bitsandbytes/nn/modules.py | 5 +-
bitsandbytes/utils.py | 31 +++++++++--
4 files changed, 83 insertions(+), 26 deletions(-)
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 8fdf7569d..75f647939 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -3,11 +3,14 @@
import warnings
import torch
+import torch.nn.functional as F
from bitsandbytes.functional import (
QuantState,
+ create_dynamic_map,
get_4bit_type,
)
+from bitsandbytes.utils import reverse_4bit_compress_format
try:
# to support Intel CPU/GPU (XPU) backend
@@ -279,8 +282,9 @@ def mm_dequant_impl(
0.8333333: 3, # 0b0011
}
+INT8_QUANT_TABLE = create_dynamic_map().tolist()
+
-@_maybe_torch_compile
def quantize_4bit_impl(
A: Tensor,
absmax: Tensor = None,
@@ -314,7 +318,7 @@ def quantize_4bit_impl(
tuple(torch.Tensor, torch.Size, torch.dtype, int):
The quantization state to undo the quantization.
"""
- if quant_type not in ["nf4", "fp4"]:
+ if quant_type not in ["nf4", "fp4", "int8"]:
raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented for CPU/XPU.")
if quant_type == "fp4":
warnings.warn("fp4 quantization is currently slow on CPU/XPU. Please Use nf4 instead for better performance.")
@@ -355,14 +359,34 @@ def quantize_4bit_impl(
for key, val in FP4_QUANT_TABLE.items():
out_uint8[abs_scaled_A > key] = val
out_uint8 += sign.to(torch.uint8) * 8
- if out_uint8.size(-1) % 2:
- out_uint8 = torch.nn.functional.pad(out_uint8, (0, 1), value=0)
- out[:] = out_uint8[1::2].bitwise_left_shift(4).bitwise_or_(out_uint8[::2])
+ elif quant_type == "int8":
+ for i in range(len(INT8_QUANT_TABLE)):
+ out_uint8[scaled_A > INT8_QUANT_TABLE[i]] = i
- code = get_4bit_type(quant_type, device=A.device)
+ if quant_type == "int8":
+ out = out_uint8
+ code = torch.Tensor(INT8_QUANT_TABLE).to(A.device)
+ else:
+ if out_uint8.size(-1) % 2:
+ out_uint8 = torch.nn.functional.pad(out_uint8, (0, 1), value=0)
+ out[:] = out_uint8[::2].bitwise_left_shift(4).bitwise_or_(out_uint8[1::2])
+ code = get_4bit_type(quant_type, device=A.device)
if compress_statistics:
- raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU")
+ offset = absmax.mean()
+ absmax -= offset
+ qabsmax, state2 = quantize_4bit_impl(absmax, blocksize=256, quant_type="int8")
+ del absmax
+ state = QuantState(
+ absmax=qabsmax,
+ shape=input_shape,
+ dtype=A.dtype,
+ blocksize=blocksize,
+ code=code,
+ quant_type=quant_type,
+ offset=offset,
+ state2=state2,
+ )
else:
state = QuantState(
absmax=absmax,
@@ -373,7 +397,21 @@ def quantize_4bit_impl(
quant_type=quant_type,
)
- return out.unsqueeze(0), state
+ return out.reshape(-1, 1), state
+
+
+def dequant_8bit(A, offset, quant_state):
+ assert A.dtype == torch.uint8
+ absmax = quant_state.code[A.reshape(-1).int()]
+ blocks = absmax.shape[-1] // 256
+ res = absmax.shape[-1] % 256
+ if res != 0:
+ absmax = F.pad(absmax, (0, 256 - res), mode="constant", value=0)
+ absmax = (absmax.view(-1, 256) * quant_state.absmax.view(-1, 1)).to(quant_state.dtype).reshape(-1)
+ absmax = absmax[: blocks * 256 + res]
+ absmax = absmax.reshape(A.shape)
+ absmax += offset
+ return absmax
@_maybe_torch_compile
@@ -411,12 +449,8 @@ def dequantize_4bit_impl(
torch.Tensor:
Dequantized tensor.
"""
- if A.shape[0] == 1:
- transpose = False
- A = A.squeeze(0)
- elif A.shape[1] == 1:
- transpose = True
- A = A.squeeze(1)
+ transpose = True if A.shape[0] == 1 else False
+ A = A.reshape(-1)
if quant_state is None:
assert absmax is not None and out is not None
@@ -438,17 +472,18 @@ def dequantize_4bit_impl(
)
if quant_state.nested:
- raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU")
+ absmax = dequant_8bit(absmax, quant_state.offset, quant_state.state2)
if ipex_cpu_only and _ipex_cpu_version_prereq(2, 5) and getattr(quant_state, "ipex", False):
- A = torch.ops.ipex_prepack.woq_linear_unpack_weight(A, "nf4", quant_state.shape, 2)
+ ipex_weight = torch.ops.ipex_prepack.woq_linear_unpack_weight(A, "nf4", quant_state.shape, 2)
+ A = reverse_4bit_compress_format(ipex_weight)
quant_state.ipex = False
# Map nf4 to [-1, 1]
out_dq = torch.empty(A.size(0) * 2, dtype=torch.int32, device=A.device)
n = out_dq.numel()
- out_dq[::2] = A & 0xF
- out_dq[1::2] = A >> 4
+ out_dq[1::2] = A & 0xF
+ out_dq[::2] = A >> 4
# quant_state.code is fp32, cast to quant_state dtype to avoid the mismatch issue
quant_state.code = quant_state.code.to(quant_state.dtype)
out_dq = quant_state.code[out_dq]
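
The hunk above also flips the nibble order used when two 4-bit codes are packed into one byte: the even-indexed code now lands in the high nibble and the odd-indexed one in the low nibble, with the unpacking changed to match. A small self-contained sketch of that convention, assuming plain integer code tensors rather than the real quantization state:

```python
import torch

def pack_nf4_codes(codes: torch.Tensor) -> torch.Tensor:
    # Even-indexed code -> high nibble, odd-indexed code -> low nibble,
    # matching the packing order this hunk switches to.
    if codes.numel() % 2:
        codes = torch.nn.functional.pad(codes, (0, 1), value=0)
    return (codes[::2].bitwise_left_shift(4) | codes[1::2]).to(torch.uint8)

def unpack_nf4_codes(packed: torch.Tensor) -> torch.Tensor:
    # Mirror of pack_nf4_codes: high nibble -> even positions, low nibble -> odd positions.
    out = torch.empty(packed.numel() * 2, dtype=torch.int32, device=packed.device)
    out[::2] = packed >> 4
    out[1::2] = packed & 0xF
    return out

codes = torch.randint(0, 16, (10,), dtype=torch.uint8)
assert torch.equal(unpack_nf4_codes(pack_nf4_codes(codes)).to(torch.uint8), codes)
```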
diff --git a/bitsandbytes/backends/xpu.py b/bitsandbytes/backends/xpu.py
index bc13963e6..aca0a0103 100644
--- a/bitsandbytes/backends/xpu.py
+++ b/bitsandbytes/backends/xpu.py
@@ -155,7 +155,7 @@ def dequantize_4bit(
if blocksize is None:
blocksize = 64
assert_on_xpu([A, absmax, out])
- if quant_type == "nf4":
+ if quant_type == "nf4" and getattr(quant_state, "ipex", False):
output = torch.ops.torch_ipex.dequantize_4bit(A, "nf4", quant_state.shape, absmax, None, blocksize).t()
else:
output = dequantize_4bit_impl(A, quant_state, absmax, out, blocksize, quant_type)
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index ad5a7d443..2320ffd39 100755
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -20,6 +20,7 @@
LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING,
OutlierTracer,
enable_ipex_fusion,
+ reverse_4bit_compress_format,
)
T = TypeVar("T", bound="torch.nn.Module")
@@ -460,9 +461,9 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
original_weight = torch.ops.ipex_prepack.woq_linear_unpack_weight(
self.weight, "nf4", self.weight.quant_state.shape, 2
)
- self.weight.data = original_weight.data
+ self.weight.data = reverse_4bit_compress_format(original_weight.data)
elif self.weight.device.type == "xpu":
- self.weight.data = self.weight.data.reshape(1, -1)
+ self.weight.data = reverse_4bit_compress_format(self.weight.data.reshape(1, -1))
self.weight.quant_state.ipex = False
diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py
index 02c9ac2ca..e3748685e 100644
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
@@ -200,18 +200,35 @@ def unpack_tensor_to_dict(tensor_data):
return unpacked_dict
+def reverse_4bit_compress_format(weight):
+ out_1 = torch.empty(weight.size(0), dtype=torch.int32, device=weight.device)
+ out_2 = torch.empty(weight.size(0), dtype=torch.int32, device=weight.device)
+ out_1 = (weight & 0xF0) >> 4
+ out_2 = (weight & 0xF) << 4
+ out = out_1 | out_2
+ return out
+
+
def enable_ipex_fusion(linear, x):
from bitsandbytes.backends.cpu_xpu_common import (
_ipex_cpu_version_prereq,
_ipex_xpu_version_prereq,
+ dequant_8bit,
ipex_cpu,
ipex_xpu,
)
+ quant_state = linear.weight.quant_state
+
+ if quant_state.nested:
+ quant_state.absmax = dequant_8bit(quant_state.absmax, quant_state.offset, quant_state.state2)
+ quant_state.nested = False
+ delattr(quant_state, "state2")
+
if x.device.type == "cpu" and ipex_cpu and _ipex_cpu_version_prereq(2, 5):
- quant_state = linear.weight.quant_state
+ converted_weight = reverse_4bit_compress_format(linear.weight.data)
new_weight, new_scales, new_zeros, _, compensation = torch.ops.ipex_prepack.woq_linear_pack_weight(
- linear.weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2]),
+ converted_weight.reshape([quant_state.shape[0], quant_state.shape[1] // 2]),
"nf4",
quant_state.shape, # weight shape
quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize), # scales
@@ -222,12 +239,16 @@ def enable_ipex_fusion(linear, x):
2,
)
elif x.device.type == "xpu" and ipex_xpu and _ipex_xpu_version_prereq(2, 5):
- quant_state = linear.weight.quant_state
- new_weight = linear.weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2])
-
+ converted_weight = reverse_4bit_compress_format(linear.weight.data)
+ new_weight = converted_weight.reshape([quant_state.shape[0], quant_state.shape[1] // 2])
new_scales = quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize)
new_zeros = None
compensation = None
+ else:
+ raise ValueError(
+            "Please check the device and ipex version. The device should be cpu or xpu and the ipex version should be >= 2.5."
+ )
+
linear.weight.data = new_weight.data
linear.weight.quant_state.ipex = True
linear.weight.quant_state.new_scales = new_scales
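
The double-quant path introduced here compresses the per-block `absmax` itself: the mean offset is removed, the residual is quantized to 8 bits against a dynamic code with a second level of 256-element blocks, and `dequant_8bit` restores it before the 4-bit weights are dequantized. Below is a simplified round-trip sketch of that idea, using a uniform 256-entry code as a stand-in for `create_dynamic_map()` and a nearest-entry lookup instead of the threshold loop:

```python
import torch
import torch.nn.functional as F

def quant_absmax_8bit(absmax: torch.Tensor, code: torch.Tensor, blocksize: int = 256):
    # Remove the mean offset, rescale each 256-element block by its own absmax,
    # and store the index of the nearest code entry (simplified sketch, not the bnb kernels).
    offset = absmax.mean()
    shifted = absmax - offset
    n = shifted.numel()
    pad = (-n) % blocksize
    blocks = F.pad(shifted, (0, pad)).view(-1, blocksize)
    scales2 = blocks.abs().amax(dim=1, keepdim=True).clamp(min=1e-12)
    idx = (blocks.div(scales2).reshape(-1, 1) - code.view(1, -1)).abs().argmin(dim=1)
    return idx.to(torch.uint8)[:n], scales2.view(-1), offset

def dequant_absmax_8bit(idx, scales2, offset, code, blocksize: int = 256):
    # Inverse: look up the code, rescale per block, and add the offset back.
    vals = code[idx.int()]
    pad = (-vals.numel()) % blocksize
    vals = F.pad(vals, (0, pad)).view(-1, blocksize) * scales2.view(-1, 1)
    return vals.reshape(-1)[: idx.numel()] + offset

code = torch.linspace(-1, 1, 256)        # stand-in for create_dynamic_map()
absmax = torch.rand(1000) * 3
idx, scales2, offset = quant_absmax_8bit(absmax, code)
recovered = dequant_absmax_8bit(idx, scales2, offset, code)
print((recovered - absmax).abs().max())  # small reconstruction error
```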
From 307fbd52bdc3734130b505781b72a1e15cf83e0c Mon Sep 17 00:00:00 2001
From: jiqing-feng
Date: Wed, 29 Jan 2025 00:31:11 +0800
Subject: [PATCH 23/29] Enable dequant+matmul 8bit path for Intel CPU and XPU
(#1484)
* new matmul8bit
Signed-off-by: jiqing-feng
* fix cxb
Signed-off-by: jiqing-feng
---------
Signed-off-by: jiqing-feng
---
bitsandbytes/autograd/_functions.py | 25 +++++++++++++++++++++++++
1 file changed, 25 insertions(+)
diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 6440ab1b5..9de5a8924 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -563,6 +563,29 @@ def backward(ctx, grad_output):
return grad_A, grad_B, None, grad_bias, None
+class MatMul8bitFp(torch.autograd.Function):
+    # For Intel CPU and XPU, double quant involves several unsafe operations that break finetuning.
+    # We therefore use dequant + matmul to run finetuning for now.
+
+ @staticmethod
+ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
+ CB = B.data.to(A.dtype).mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0)).t()
+ output = torch.matmul(A, CB).to(A.dtype)
+ ctx.state = state
+ ctx.dtype_A = A.dtype
+ ctx.grad_shape = A.shape
+ return output
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ state = ctx.state
+ B = state.CxB if state.CxB is not None else state.CB
+ CB = B.to(ctx.dtype_A).mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0))
+ grad_A = torch.matmul(grad_output, CB).view(ctx.grad_shape).to(ctx.dtype_A)
+
+ return grad_A, None, None, None, None
+
+
def matmul(
A: torch.Tensor,
B: torch.Tensor,
@@ -574,6 +597,8 @@ def matmul(
state = state or MatmulLtState()
if threshold > 0.0:
state.threshold = threshold
+ if A.device.type in ("cpu", "xpu") and state.is_training:
+ return MatMul8bitFp.apply(A, B, out, bias, state)
return MatMul8bitLt.apply(A, B, out, bias, state)
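
`MatMul8bitFp` sidesteps the int8 double-quant kernels on CPU/XPU by reconstructing the weight from its int8 rows and per-row scales (`SCB / 127`) and running a dense matmul in the activation dtype. A minimal sketch of that dequant-then-matmul step with made-up shapes, leaving out the `MatmulLtState` plumbing and the backward pass:

```python
import torch

def dequant_matmul_8bit(A: torch.Tensor, CB: torch.Tensor, SCB: torch.Tensor) -> torch.Tensor:
    # CB: int8 weight (out_features, in_features); SCB: per-row absmax scales.
    # Reconstruct W ~= CB * SCB / 127 and fall back to a plain matmul.
    W = CB.to(A.dtype) * (SCB.unsqueeze(1).to(A.dtype) / 127.0)
    return A @ W.t()

A = torch.randn(4, 8, dtype=torch.bfloat16)
W = torch.randn(16, 8)
SCB = W.abs().amax(dim=1)                                   # per-row absmax
CB = torch.round(W / SCB.unsqueeze(1) * 127).to(torch.int8)
print(dequant_matmul_8bit(A, CB, SCB).shape)                # torch.Size([4, 16])
```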
From a0a95fd70e1c988043b7a674b882841ba21bf4be Mon Sep 17 00:00:00 2001
From: Fanli Lin
Date: Wed, 29 Jan 2025 00:34:39 +0800
Subject: [PATCH 24/29] add device index (#1489)
---
bitsandbytes/nn/modules.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 2320ffd39..81404179d 100755
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -660,9 +660,9 @@ def cpu(self):
self.SCB = SCB
return self
- def xpu(self):
+ def xpu(self, device):
# we store the 8-bit rows-major weight
- B = self.data.contiguous().to(torch.float16).xpu()
+ B = self.data.contiguous().to(torch.float16).xpu(device)
CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
if CBt is not None:
del CBt
@@ -700,11 +700,11 @@ def to(self, *args, **kwargs):
return self.cpu()
elif device.type == "xpu":
if self.data.dtype == torch.int8:
- self.data = self.data.contiguous().xpu()
+ self.data = self.data.contiguous().xpu(device)
self.CB = self.data
return self
else:
- return self.xpu()
+ return self.xpu(device)
else:
new_param = Int8Params(
super().to(device=device, dtype=dtype, non_blocking=non_blocking),
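
The point of the fix above is that the device index must survive the move: `.xpu()` without an argument lands on the default XPU, while forwarding the parsed `torch.device` keeps `xpu:1` as `xpu:1`. A tiny illustration with a hypothetical helper (usage commented out since it needs actual XPU hardware):

```python
import torch

def move_int8_weight(data: torch.Tensor, device: torch.device) -> torch.Tensor:
    # Forward the full device object so its index is honored
    # (torch.device("xpu:1").index == 1); a bare .xpu() would ignore it.
    if device.type == "xpu":
        return data.contiguous().to(device)
    return data.to(device)

# Hypothetical usage on a multi-card XPU machine:
# weight = move_int8_weight(weight, torch.device("xpu:1"))
```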
From 3ac9d38b7f1af66603eecb5c816ef6bd72fa3ff2 Mon Sep 17 00:00:00 2001
From: Lzy17
Date: Fri, 7 Feb 2025 15:52:55 +0000
Subject: [PATCH 25/29] IFU-master-2025-02-07
---
conflicts.diff | 93 --------------------------------------------------
1 file changed, 93 deletions(-)
delete mode 100644 conflicts.diff
diff --git a/conflicts.diff b/conflicts.diff
deleted file mode 100644
index d49244f34..000000000
--- a/conflicts.diff
+++ /dev/null
@@ -1,93 +0,0 @@
-diff --cc docs/source/installation.mdx
-index 146fb0d,7961385..0000000
---- a/docs/source/installation.mdx
-+++ b/docs/source/installation.mdx
-@@@ -165,15 -243,80 +243,87 @@@ apt-get update && apt-get install -y gi
-
- # Install pytorch compatible with above ROCm version
- pip install torch --index-url https://download.pytorch.org/whl/rocm6.1/
-+ ```
-
-++<<<<<<< HEAD
- +# Install bitsandbytes from PyPI
- +# (This is supported on Ubuntu 22.04, Python 3.10, ROCm 6.1.0/6.1.1/6.1.2/6.2.0 and gpu arch - gfx90a, gfx942, gfx1100
- +# Please install from source if your configuration doesn't match with these)
- +pip install bitsandbytes
-++=======
-+
-+
-++>>>>>>> upstream/multi-backend-refactor
-+
-+ Compatible hardware and functioning `import intel_extension_for_pytorch as ipex` capable environment with Python `3.10` as the minimum requirement.
-+
-+ Please refer to [the official Intel installations instructions](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu&version=v2.4.0%2bcpu&os=linux%2fwsl2) for guidance on how to pip install the necessary `intel_extension_for_pytorch` dependency.
-+
-+
-+
-+
-+ Compatible hardware and functioning `import torch_npu` capable environment with Python `3.10` as the minimum requirement.
-+
-+ Please refer to [the official Ascend installations instructions](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/configandinstg/instg/insg_0001.html) for guidance on how to pip install the necessary `torch_npu` dependency.
-+
-+
-+
-
-+ > [!TIP]
-+ > Apple Silicon support is still a WIP. Please visit and write us in [this Github Discussion space on coordinating the kickoff of MPS backend development](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1340) and coordinate a community-led effort to implement this backend.
-+
-+
-+
-+
-+ ### Installation
-+
-+ You can install the pre-built wheels for each backend, or compile from source for custom configurations.
-+
-+ #### Pre-built Wheel Installation (recommended)[[multi-backend-pip]]
-+
-+
-+
-+
-+ ```
-+ # Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
-+ pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-manylinux_2_24_x86_64.whl'
-+ ```
-+
-+
-+
-+
-+ ```
-+ # Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
-+ pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-win_amd64.whl'
-+ ```
-+
-+
-+
-+
-+ > [!WARNING]
-+ > bitsandbytes does not yet support Apple Silicon / Metal with a dedicated backend. However, the build infrastructure is in place and the below pip install will eventually provide Apple Silicon support as it becomes available on the `multi-backend-refactor` branch based on community contributions.
-+
-+ ```
-+ # Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
-+ pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-macosx_13_1_arm64.whl'
-+ ```
-+
-+
-+
-+
-+ #### Compile from Source[[multi-backend-compile]]
-+
-+
-+
-+
-+ #### AMD GPU
-+
-+ bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release).
-+
-+ ```bash
- # Install bitsandbytes from source
- # Clone bitsandbytes repo, ROCm backend is currently enabled on multi-backend-refactor branch
-- git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
-+ git clone -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
-
- # Install dependencies
- pip install -r requirements-dev.txt
From 057bf4e0733eafbaf831818788b51f304a063ede Mon Sep 17 00:00:00 2001
From: Lzy17
Date: Thu, 13 Feb 2025 20:08:16 +0000
Subject: [PATCH 26/29] fix _functions
---
bitsandbytes/autograd/_functions.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 9de5a8924..5fb6c10ea 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -464,7 +464,7 @@ def backward(ctx, grad_output):
grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous()
Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = None, None, None, None, None
- if req_gradB or (req_gradA and state.CBt):
+ if req_gradB or (req_gradA and state.CBt is not None):
Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16))
if req_gradB:
CxAt, SAt = F.transform(CAt, formatB, transpose=True)
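
The one-line change above matters because `if state.CBt:` invokes `bool()` on a tensor, which raises for anything with more than one element (and for a single element it would test the value rather than presence). The `is not None` form tests what was actually meant. A quick demonstration:

```python
import torch

CBt = torch.zeros(2, 2)

try:
    if CBt:                  # bool() on a multi-element tensor is ambiguous and raises
        pass
except RuntimeError as err:
    print("truthiness check fails:", err)

if CBt is not None:          # the fixed check: tests presence, not contents
    print("CBt is present, shape", tuple(CBt.shape))
```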
From 250eb5e43094be981bc5ebd9316d77a00a12af29 Mon Sep 17 00:00:00 2001
From: Zhaoyi Li <36555117+Lzy17@users.noreply.github.com>
Date: Fri, 21 Feb 2025 06:37:36 -0600
Subject: [PATCH 27/29] Update installation.mdx
---
docs/source/installation.mdx | 60 ++++++++++++++++++++----------------
1 file changed, 34 insertions(+), 26 deletions(-)
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index c96d15247..4f64f6385 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -19,29 +19,32 @@ Welcome to the installation guide for the `bitsandbytes` library! This document
## CUDA[[cuda]]
-`bitsandbytes` is currently only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**. However, there's an ongoing multi-backend effort under development, which is currently in alpha. If you're interested in providing feedback or testing, check out [the multi-backend section below](#multi-backend).
+`bitsandbytes` is currently only supported on CUDA GPUs for CUDA versions **11.0 - 12.6**. However, there's an ongoing multi-backend effort under development, which is currently in alpha. If you're interested in providing feedback or testing, check out [the multi-backend section below](#multi-backend).
### Supported CUDA Configurations[[cuda-pip]]
-The latest version of `bitsandbytes` builds on the following configurations:
+The latest version of the distributed `bitsandbytes` package is built with the following configurations:
-| **OS** | **CUDA Version** | **Compiler** |
+| **OS** | **CUDA Toolkit** | **Host Compiler** |
|-------------|------------------|----------------------|
| **Linux** | 11.7 - 12.3 | GCC 11.4 |
+| | 12.4 - 12.6 | GCC 13.2 |
+| **Windows** | 11.7 - 12.6 | MSVC 19.42+ (VS2022) |
| | 12.4+ | GCC 13.2 |
-| **Windows** | 11.7 - 12.4 | MSVC 19.38+ (VS2022) |
+| **Windows** | 11.7 - 12.6 | MSVC 19.38+ (VS2022) |
-For Linux systems, ensure your hardware meets the following requirements:
+For CUDA systems, ensure your hardware meets the following requirements:
-| **Feature** | **Hardware Requirement** |
-|---------------------------------|--------------------------------------------------------------------|
-| LLM.int8() | NVIDIA Turing (RTX 20 series, T4) or Ampere (RTX 30 series, A4-A100) GPUs |
-| 8-bit optimizers/quantization | NVIDIA Kepler (GTX 780 or newer) |
+| **Feature** | **Minimum Hardware Requirement** |
+|---------------------------------|---------------------------------------------------------------|
+| LLM.int8() | NVIDIA Turing (RTX 20 series, T4) or newer GPUs |
+| 8-bit optimizers/quantization | NVIDIA Maxwell (GTX 900 series, TITAN X, M40) or newer GPUs * |
+| NF4/FP4 quantization | NVIDIA Maxwell (GTX 900 series, TITAN X, M40) or newer GPUs * |
> [!WARNING]
-> `bitsandbytes >= 0.39.1` no longer includes Kepler binaries in pip installations. This requires [manual compilation using](#cuda-compile) the `cuda11x_nomatmul_kepler` configuration.
-
-To install from PyPI.
+> `bitsandbytes >= 0.45.0` no longer supports Kepler GPUs.
+>
+> Support for Maxwell GPUs is deprecated and will be removed in a future release. For the best results, a Turing generation device or newer is recommended.
```bash
pip install bitsandbytes
@@ -79,7 +82,7 @@ For Linux and Windows systems, compiling from source allows you to customize the
-To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. Make sure you have a compiler installed to compile C++ (`gcc`, `make`, headers, etc.).
+To compile from source, you need CMake >= **3.22.1** and Python >= **3.9** installed. Make sure you have a compiler installed to compile C++ (`gcc`, `make`, headers, etc.).
For example, to install a compiler and CMake on Ubuntu:
@@ -115,7 +118,7 @@ pip install -e . # `-e` for "editable" install, when developing BNB (otherwise
Windows systems require Visual Studio with C++ support as well as an installation of the CUDA SDK.
-To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. You should also install CUDA Toolkit by following the [CUDA Installation Guide for Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html) guide from NVIDIA.
+To compile from source, you need CMake >= **3.22.1** and Python >= **3.9** installed. You should also install CUDA Toolkit by following the [CUDA Installation Guide for Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html) guide from NVIDIA.
Refer to the following table if you're using another CUDA Toolkit version.
@@ -125,7 +128,6 @@ Refer to the following table if you're using another CUDA Toolkit version.
```bash
git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
-pip install -r requirements-dev.txt
cmake -DCOMPUTE_BACKEND=cuda -S .
cmake --build . --config Release
pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out)
@@ -150,12 +152,12 @@ Then locally install the CUDA version you need with this script from bitsandbyte
```bash
wget https://raw.githubusercontent.com/bitsandbytes-foundation/bitsandbytes/main/install_cuda.sh
# Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
-# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125}
+# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125, 126}
# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
-# For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
+# For example, the following installs CUDA 12.6 to ~/local/cuda-12.6 and exports the path to your .bashrc
-bash install_cuda.sh 117 ~/local 1
+bash install_cuda.sh 126 ~/local 1
```
2. Set the environment variables `BNB_CUDA_VERSION` and `LD_LIBRARY_PATH` by manually overriding the CUDA version installed by PyTorch.
@@ -171,8 +173,8 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:
For example, to use a local install path:
```bash
-export BNB_CUDA_VERSION=117
-export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-11.7
+export BNB_CUDA_VERSION=126
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-12.6
```
3. Now when you launch bitsandbytes with these environment variables, the PyTorch CUDA version is overridden by the new CUDA version (in this example, version 11.7) and a different bitsandbytes library is loaded.
@@ -208,8 +210,8 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-11.7
|-------------|------------------------|---------------------------|-------------------------|------------|
| **AMD ROCm** | 6.1+ | 3.10+ | minimum CDNA - `gfx90a`, RDNA - `gfx1100` | Alpha |
| **Apple Silicon (MPS)** | WIP | 3.10+ | M1/M2 chips | Planned |
-| **Intel CPU** | v2.5.0+ (`ipex`) | 3.10+ | Intel CPU | Alpha |
-| **Intel GPU** | v2.5.0+ (`ipex`) | 3.10+ | Intel GPU | Experimental |
+| **Intel CPU** | v2.4.0+ (`ipex`) | 3.10+ | Intel CPU | Alpha |
+| **Intel GPU** | v2.4.0+ (`ipex`) | 3.10+ | Intel GPU | Experimental |
| **Ascend NPU** | 2.1.0+ (`torch_npu`) | 3.10+ | Ascend NPU | Experimental |
For each supported backend, follow the respective instructions below:
@@ -245,10 +247,6 @@ apt-get update && apt-get install -y git && cd home
pip install torch --index-url https://download.pytorch.org/whl/rocm6.1/
```
-# Install bitsandbytes from PyPI
-# (This is supported on Ubuntu 22.04, Python 3.10, ROCm 6.1.0/6.1.1/6.1.2/6.2.0 and gpu arch - gfx90a, gfx942, gfx1100
-# Please install from source if your configuration doesn't match with these)
-pip install bitsandbytes
@@ -294,6 +292,14 @@ pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsan
pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-win_amd64.whl'
```
+
+
+
+
+Compatible hardware and functioning `import torch_npu` capable environment with Python `3.10` as the minimum requirement.
+
+Please refer to [the official Ascend installations instructions](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/configandinstg/instg/insg_0001.html) for guidance on how to pip install the necessary `torch_npu` dependency.
+
@@ -348,6 +354,8 @@ The below commands are for Linux. For installing on Windows, please adapt the be
git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
pip install intel_extension_for_pytorch
pip install -r requirements-dev.txt
+cmake -DCOMPUTE_BACKEND=cpu -S .
+make
pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out)
```
From a4e685295981c0f59a223562f1de012c32e8f040 Mon Sep 17 00:00:00 2001
From: Zhaoyi Li <36555117+Lzy17@users.noreply.github.com>
Date: Fri, 21 Feb 2025 06:40:40 -0600
Subject: [PATCH 28/29] Update installation.mdx to the correct version
---
docs/source/installation.mdx | 8 --------
1 file changed, 8 deletions(-)
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 4f64f6385..da4ab3b44 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -254,13 +254,6 @@ Compatible hardware and functioning `import intel_extension_for_pytorch as ipex`
Please refer to [the official Intel installations instructions](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu&version=v2.4.0%2bcpu&os=linux%2fwsl2) for guidance on how to pip install the necessary `intel_extension_for_pytorch` dependency.
-
-
-
-Compatible hardware and functioning `import torch_npu` capable environment with Python `3.10` as the minimum requirement.
-
-Please refer to [the official Ascend installations instructions](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/configandinstg/instg/insg_0001.html) for guidance on how to pip install the necessary `torch_npu` dependency.
-
@@ -293,7 +286,6 @@ pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsan
```
-
Compatible hardware and functioning `import torch_npu` capable environment with Python `3.10` as the minimum requirement.
From 6dedda46ab061183d4a7eaa5eb9c56b3f0061506 Mon Sep 17 00:00:00 2001
From: Zhaoyi Li <36555117+Lzy17@users.noreply.github.com>
Date: Fri, 21 Feb 2025 06:49:07 -0600
Subject: [PATCH 29/29] Fix installation.mdx
---
docs/source/installation.mdx | 62 +++++++++++++++++-------------------
1 file changed, 29 insertions(+), 33 deletions(-)
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index da4ab3b44..79613856f 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -19,32 +19,29 @@ Welcome to the installation guide for the `bitsandbytes` library! This document
## CUDA[[cuda]]
-`bitsandbytes` is currently only supported on CUDA GPUs for CUDA versions **11.0 - 12.6**. However, there's an ongoing multi-backend effort under development, which is currently in alpha. If you're interested in providing feedback or testing, check out [the multi-backend section below](#multi-backend).
+`bitsandbytes` is currently only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**. However, there's an ongoing multi-backend effort under development, which is currently in alpha. If you're interested in providing feedback or testing, check out [the multi-backend section below](#multi-backend).
### Supported CUDA Configurations[[cuda-pip]]
-The latest version of the distributed `bitsandbytes` package is built with the following configurations:
+The latest version of `bitsandbytes` builds on the following configurations:
-| **OS** | **CUDA Toolkit** | **Host Compiler** |
+| **OS** | **CUDA Version** | **Compiler** |
|-------------|------------------|----------------------|
| **Linux** | 11.7 - 12.3 | GCC 11.4 |
-| | 12.4 - 12.6 | GCC 13.2 |
-| **Windows** | 11.7 - 12.6 | MSVC 19.42+ (VS2022) |
| | 12.4+ | GCC 13.2 |
-| **Windows** | 11.7 - 12.6 | MSVC 19.38+ (VS2022) |
+| **Windows** | 11.7 - 12.4 | MSVC 19.38+ (VS2022) |
-For CUDA systems, ensure your hardware meets the following requirements:
+For Linux systems, ensure your hardware meets the following requirements:
-| **Feature** | **Minimum Hardware Requirement** |
-|---------------------------------|---------------------------------------------------------------|
-| LLM.int8() | NVIDIA Turing (RTX 20 series, T4) or newer GPUs |
-| 8-bit optimizers/quantization | NVIDIA Maxwell (GTX 900 series, TITAN X, M40) or newer GPUs * |
-| NF4/FP4 quantization | NVIDIA Maxwell (GTX 900 series, TITAN X, M40) or newer GPUs * |
+| **Feature** | **Hardware Requirement** |
+|---------------------------------|--------------------------------------------------------------------|
+| LLM.int8() | NVIDIA Turing (RTX 20 series, T4) or Ampere (RTX 30 series, A4-A100) GPUs |
+| 8-bit optimizers/quantization | NVIDIA Kepler (GTX 780 or newer) |
> [!WARNING]
-> `bitsandbytes >= 0.45.0` no longer supports Kepler GPUs.
->
-> Support for Maxwell GPUs is deprecated and will be removed in a future release. For the best results, a Turing generation device or newer is recommended.
+> `bitsandbytes >= 0.39.1` no longer includes Kepler binaries in pip installations. This requires [manual compilation using](#cuda-compile) the `cuda11x_nomatmul_kepler` configuration.
+
+To install from PyPI.
```bash
pip install bitsandbytes
@@ -82,7 +79,7 @@ For Linux and Windows systems, compiling from source allows you to customize the
-To compile from source, you need CMake >= **3.22.1** and Python >= **3.9** installed. Make sure you have a compiler installed to compile C++ (`gcc`, `make`, headers, etc.).
+To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. Make sure you have a compiler installed to compile C++ (`gcc`, `make`, headers, etc.).
For example, to install a compiler and CMake on Ubuntu:
@@ -118,7 +115,7 @@ pip install -e . # `-e` for "editable" install, when developing BNB (otherwise
Windows systems require Visual Studio with C++ support as well as an installation of the CUDA SDK.
-To compile from source, you need CMake >= **3.22.1** and Python >= **3.9** installed. You should also install CUDA Toolkit by following the [CUDA Installation Guide for Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html) guide from NVIDIA.
+To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. You should also install CUDA Toolkit by following the [CUDA Installation Guide for Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html) guide from NVIDIA.
Refer to the following table if you're using another CUDA Toolkit version.
@@ -128,6 +125,7 @@ Refer to the following table if you're using another CUDA Toolkit version.
```bash
git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
+pip install -r requirements-dev.txt
cmake -DCOMPUTE_BACKEND=cuda -S .
cmake --build . --config Release
pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out)
@@ -152,12 +150,12 @@ Then locally install the CUDA version you need with this script from bitsandbyte
```bash
wget https://raw.githubusercontent.com/bitsandbytes-foundation/bitsandbytes/main/install_cuda.sh
# Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
-# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125, 126}
+# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125}
# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
-# For example, the following installs CUDA 12.6 to ~/local/cuda-12.6 and exports the path to your .bashrc
+# For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
-bash install_cuda.sh 126 ~/local 1
+bash install_cuda.sh 117 ~/local 1
```
2. Set the environment variables `BNB_CUDA_VERSION` and `LD_LIBRARY_PATH` by manually overriding the CUDA version installed by PyTorch.
@@ -173,8 +171,8 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:
For example, to use a local install path:
```bash
-export BNB_CUDA_VERSION=126
-export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-12.6
+export BNB_CUDA_VERSION=117
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-11.7
```
3. Now when you launch bitsandbytes with these environment variables, the PyTorch CUDA version is overridden by the new CUDA version (in this example, version 11.7) and a different bitsandbytes library is loaded.
@@ -210,8 +208,8 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-12.6
|-------------|------------------------|---------------------------|-------------------------|------------|
| **AMD ROCm** | 6.1+ | 3.10+ | minimum CDNA - `gfx90a`, RDNA - `gfx1100` | Alpha |
| **Apple Silicon (MPS)** | WIP | 3.10+ | M1/M2 chips | Planned |
-| **Intel CPU** | v2.4.0+ (`ipex`) | 3.10+ | Intel CPU | Alpha |
-| **Intel GPU** | v2.4.0+ (`ipex`) | 3.10+ | Intel GPU | Experimental |
+| **Intel CPU** | v2.5.0+ (`ipex`) | 3.10+ | Intel CPU | Alpha |
+| **Intel GPU** | v2.5.0+ (`ipex`) | 3.10+ | Intel GPU | Experimental |
| **Ascend NPU** | 2.1.0+ (`torch_npu`) | 3.10+ | Ascend NPU | Experimental |
For each supported backend, follow the respective instructions below:
@@ -254,6 +252,13 @@ Compatible hardware and functioning `import intel_extension_for_pytorch as ipex`
Please refer to [the official Intel installations instructions](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu&version=v2.4.0%2bcpu&os=linux%2fwsl2) for guidance on how to pip install the necessary `intel_extension_for_pytorch` dependency.
+
+
+
+Compatible hardware and functioning `import torch_npu` capable environment with Python `3.10` as the minimum requirement.
+
+Please refer to [the official Ascend installations instructions](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/configandinstg/instg/insg_0001.html) for guidance on how to pip install the necessary `torch_npu` dependency.
+
@@ -285,13 +290,6 @@ pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsan
pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-win_amd64.whl'
```
-
-
-
-Compatible hardware and functioning `import torch_npu` capable environment with Python `3.10` as the minimum requirement.
-
-Please refer to [the official Ascend installations instructions](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/configandinstg/instg/insg_0001.html) for guidance on how to pip install the necessary `torch_npu` dependency.
-
@@ -346,8 +344,6 @@ The below commands are for Linux. For installing on Windows, please adapt the be
git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
pip install intel_extension_for_pytorch
pip install -r requirements-dev.txt
-cmake -DCOMPUTE_BACKEND=cpu -S .
-make
pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out)
```