Skip to content

Commit d214754

Browse files
committed
switch to llama.cpp fork and llama : expose C API to get layer device type
1 parent 5fcd220 commit d214754

File tree

3 files changed

+22
-2
lines changed

3 files changed

+22
-2
lines changed

.gitmodules

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
[submodule "vendor/llama.cpp"]
22
path = vendor/llama.cpp
3-
url = https://github.com/ggerganov/llama.cpp.git
3+
url = http://github.com/inference-sh/llama.cpp

llama_cpp/_internals.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import os
44
import ctypes
5+
from enum import Enum
56

67
from typing import (
78
Dict,
@@ -24,7 +25,13 @@
2425

2526

2627
# Python wrappers over llama.h structs
27-
28+
class LlamaBackendDev(Enum):
    """Device type a model layer is placed on.

    Values mirror the integer codes returned by the llama.cpp C API
    (see ``llama_model_dev_layer``) — keep them in sync with the C enum.
    """

    # CPU device using system memory
    CPU = 0
    # GPU device using dedicated memory
    GPU = 1
    # accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
    ACCEL = 2
2835

2936
class LlamaModel:
3037
"""Intermediate Python wrapper for a llama.cpp llama_model.
@@ -88,6 +95,12 @@ def n_ctx_train(self) -> int:
8895

8996
def n_embd(self) -> int:
9097
return llama_cpp.llama_n_embd(self.model)
98+
99+
def n_layer(self) -> int:
    """Return the model's layer count via ``llama_cpp.llama_n_layer``."""
    handle = self.model
    return llama_cpp.llama_n_layer(handle)
101+
102+
def dev_layer(self, il: int) -> LlamaBackendDev:
    """Return the backend device type for layer *il*.

    Thin wrapper over ``llama_cpp.llama_model_dev_layer``; the raw integer
    code from the C API is converted to a :class:`LlamaBackendDev` member.
    """
    raw_code = llama_cpp.llama_model_dev_layer(self.model, il)
    return LlamaBackendDev(raw_code)
91104

92105
def rope_freq_scale_train(self) -> float:
93106
return llama_cpp.llama_model_rope_freq_scale_train(self.model)

llama_cpp/llama.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -490,6 +490,13 @@ def ctx(self) -> llama_cpp.llama_context_p:
490490
@property
491491
def model(self) -> llama_cpp.llama_model_p:
492492
return self._model.model
493+
494+
@property
def n_layer(self) -> int:
    """Layer count of the loaded model (delegates to the wrapped ``LlamaModel``)."""
    wrapped = self._model
    return wrapped.n_layer()
497+
498+
def dev_layer(self, il: int) -> internals.LlamaBackendDev:
    """Return the backend device type for layer *il* (delegates to the wrapped ``LlamaModel``)."""
    wrapped = self._model
    return wrapped.dev_layer(il)
493500

494501
@property
495502
def _input_ids(self) -> npt.NDArray[np.intc]:

0 commit comments

Comments
 (0)