Skip to content

Commit 525255a

Browse files
committed
port kv_cache to new memory
1 parent 1d23ae0 commit 525255a

File tree

7 files changed

+72
-335
lines changed

7 files changed

+72
-335
lines changed

examples/notebooks/Batching.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@
230230
"outputs": [],
231231
"source": [
232232
"for i in range(n_parallel):\n",
233-
" llama_cpp.llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens)"
233+
" llama_cpp.llama_kv_self_seq_cp(ctx, 0, i, 0, batch.n_tokens)"
234234
]
235235
},
236236
{

llama_cpp/_ctypes_extensions.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import ctypes
66
import functools
77
import pathlib
8+
import logging
9+
import traceback
810

911
from typing import (
1012
Any,
@@ -18,6 +20,9 @@
1820
)
1921
from typing_extensions import TypeAlias
2022

23+
# Configure logging
24+
logging.basicConfig(level=logging.INFO)
25+
logger = logging.getLogger("llama_cpp.binding")
2126

2227
# Load the library
2328
def load_shared_library(lib_base_name: str, base_path: pathlib.Path):
@@ -110,11 +115,21 @@ def ctypes_function(
110115
):
111116
def decorator(f: F) -> F:
112117
if enabled:
118+
print(f"Setting up binding for C function: {name}") # Print when binding is created
113119
func = getattr(lib, name)
114120
func.argtypes = argtypes
115121
func.restype = restype
116-
functools.wraps(f)(func)
117-
return func
122+
123+
@functools.wraps(f)
124+
def wrapper(*args, **kwargs):
125+
print(f">>> Calling {name} with args: {args}") # Print right before C call
126+
sys.stdout.flush() # Force flush to ensure we see the output
127+
result = func(*args, **kwargs)
128+
print(f"<<< {name} returned successfully") # Print after successful return
129+
sys.stdout.flush()
130+
return result
131+
132+
return wrapper
118133
else:
119134
return f
120135

llama_cpp/_internals.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -289,20 +289,20 @@ def n_ctx(self) -> int:
289289
def pooling_type(self) -> int:
290290
return llama_cpp.llama_pooling_type(self.ctx)
291291

292-
def kv_cache_clear(self):
293-
llama_cpp.llama_kv_cache_clear(self.ctx)
292+
def kv_self_clear(self):
293+
llama_cpp.llama_kv_self_clear(self.ctx)
294294

295-
def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int):
296-
llama_cpp.llama_kv_cache_seq_rm(self.ctx, seq_id, p0, p1)
295+
def kv_self_seq_rm(self, seq_id: int, p0: int, p1: int):
296+
llama_cpp.llama_kv_self_seq_rm(self.ctx, seq_id, p0, p1)
297297

298-
def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
299-
llama_cpp.llama_kv_cache_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1)
298+
def kv_self_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
299+
llama_cpp.llama_kv_self_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1)
300300

301-
def kv_cache_seq_keep(self, seq_id: int):
302-
llama_cpp.llama_kv_cache_seq_keep(self.ctx, seq_id)
301+
def kv_self_seq_keep(self, seq_id: int):
302+
llama_cpp.llama_kv_self_seq_keep(self.ctx, seq_id)
303303

304-
def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int):
305-
llama_cpp.llama_kv_cache_seq_add(self.ctx, seq_id, p0, p1, shift)
304+
def kv_self_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int):
305+
llama_cpp.llama_kv_self_seq_add(self.ctx, seq_id, p0, p1, shift)
306306

307307
def get_state_size(self) -> int:
308308
return llama_cpp.llama_get_state_size(self.ctx)

llama_cpp/llama.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -581,7 +581,7 @@ def eval(self, tokens: Sequence[int]):
581581
Args:
582582
tokens: The list of tokens to evaluate.
583583
"""
584-
self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
584+
self._ctx.kv_self_seq_rm(-1, self.n_tokens, -1)
585585
for i in range(0, len(tokens), self.n_batch):
586586
batch = tokens[i : min(len(tokens), i + self.n_batch)]
587587
n_past = self.n_tokens
@@ -889,7 +889,7 @@ def generate(
889889

890890
if sample_idx < self.n_tokens and token != self._input_ids[sample_idx]:
891891
self.n_tokens = sample_idx
892-
self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
892+
self._ctx.kv_self_seq_rm(-1, self.n_tokens, -1)
893893
break
894894

895895
if self.draft_model is not None:
@@ -985,7 +985,7 @@ def embed(
985985
data: Union[List[List[float]], List[List[List[float]]]] = []
986986

987987
def decode_batch(seq_sizes: List[int]):
988-
llama_cpp.llama_kv_cache_clear(self._ctx.ctx)
988+
llama_cpp.llama_kv_self_clear(self._ctx.ctx)
989989
self._ctx.decode(self._batch)
990990
self._batch.reset()
991991

@@ -1056,7 +1056,7 @@ def decode_batch(seq_sizes: List[int]):
10561056

10571057
output = data[0] if isinstance(input, str) else data
10581058

1059-
llama_cpp.llama_kv_cache_clear(self._ctx.ctx)
1059+
llama_cpp.llama_kv_self_clear(self._ctx.ctx)
10601060
self.reset()
10611061

10621062
if return_count:

llama_cpp/llama_chat_format.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2847,7 +2847,7 @@ def __call__(
28472847

28482848
# Evaluate prompt
28492849
llama.reset()
2850-
llama._ctx.kv_cache_clear()
2850+
llama._ctx.kv_self_clear()
28512851
for type_, value in split_text:
28522852
if type_ == "text":
28532853
tokens = llama.tokenize(

0 commit comments

Comments (0)