
Commit 1d23ae0

migrate llava to mtmd
1 parent 54691ca

File tree

3 files changed, +62 −72 lines changed
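In short, the commit renames the LLAVA_BUILD option to MTMD_BUILD, replaces the llava shared library with mtmd, and moves the ctypes bindings from llama_cpp.llava_cpp to llama_cpp.mtmd_cpp, with llava_* symbols renamed to mtmd_*. A minimal sketch of what the rename looks like from calling code (the commented calls assume an already-loaded clip context, which is not part of this excerpt):

```python
# Before this commit, callers imported the llava bindings:
# import llama_cpp.llava_cpp as llava_cpp
# embed = llava_cpp.llava_image_embed_make_with_filename(clip_ctx, 4, b"image.png")

# After this commit, the same surface lives in llama_cpp.mtmd_cpp:
import llama_cpp.mtmd_cpp as mtmd_cpp

# embed = mtmd_cpp.mtmd_image_embed_make_with_filename(clip_ctx, 4, b"image.png")
# mtmd_cpp.mtmd_image_embed_free(embed)
```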

CMakeLists.txt

Lines changed: 10 additions & 20 deletions
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.21)
 project(llama_cpp)
 
 option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python package" ON)
-option(LLAVA_BUILD "Build llava shared library and install alongside python package" ON)
+option(MTMD_BUILD "Build multimodal (mtmd) shared library and install alongside python package" ON)
 
 function(llama_cpp_python_install_target target)
     if(NOT TARGET ${target})
@@ -135,7 +135,7 @@ if (LLAMA_BUILD)
         )
     endif()
 
-    if (LLAVA_BUILD)
+    if (MTMD_BUILD)
         if (LLAMA_CUBLAS OR LLAMA_CUDA)
             add_compile_definitions(GGML_USE_CUBLAS)
             add_compile_definitions(GGML_USE_CUDA)
@@ -145,36 +145,26 @@ if (LLAMA_BUILD)
             add_compile_definitions(GGML_USE_METAL)
         endif()
 
-        # Building llava
+        # Building multimodal support using mtmd
         add_subdirectory(vendor/llama.cpp/tools/mtmd)
-        set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
 
         if (WIN32)
-            set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF)
+            set_target_properties(mtmd PROPERTIES CUDA_ARCHITECTURES OFF)
         endif()
-        llama_cpp_python_install_target(llava_shared)
+        llama_cpp_python_install_target(mtmd)
        if (WIN32)
            install(
-                FILES $<TARGET_RUNTIME_DLLS:llava_shared>
+                FILES $<TARGET_RUNTIME_DLLS:mtmd>
                DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib
            )
            install(
-                FILES $<TARGET_RUNTIME_DLLS:llava_shared>
+                FILES $<TARGET_RUNTIME_DLLS:mtmd>
                DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib
            )
        endif()
 
-        # Fix for llava build: Add include directory for llama.h
-        # Move these commands after the add_subdirectory call
-        target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
-        target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include)
-
-        if (BUILD_SHARED_LIBS)
-            target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
-            target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include)
-        endif()
-
-        target_include_directories(llama-llava-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
-        target_include_directories(llama-minicpmv-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
+        # Add include directories for mtmd
+        target_include_directories(mtmd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
+        target_include_directories(mtmd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include)
    endif()
endif()
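Since the shared library target and its output name change from llava to mtmd, a build that silently picks up a stale llava library is the main migration hazard. A small, hedged check of the installed artifact (the file name suffix is platform dependent, and the lib directory assumed here is the one targeted by the install() rules above):

```python
import ctypes
import pathlib
import llama_cpp

# The mtmd target installs next to the package, in llama_cpp/lib,
# just as the old llava library did.
lib_dir = pathlib.Path(llama_cpp.__file__).parent / "lib"

# Library file name is platform dependent: libmtmd.so, libmtmd.dylib, or mtmd.dll.
candidates = sorted(lib_dir.glob("*mtmd*"))
print("found:", candidates)
if candidates:
    ctypes.CDLL(str(candidates[0]))  # raises OSError if the built library is broken
```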

llama_cpp/llama_chat_format.py

Lines changed: 21 additions & 21 deletions
@@ -2714,23 +2714,23 @@ class Llava15ChatHandler:
         )
 
     def __init__(self, clip_model_path: str, verbose: bool = True):
-        import llama_cpp.llava_cpp as llava_cpp
+        import llama_cpp.mtmd_cpp as mtmd_cpp
 
         self.clip_model_path = clip_model_path
         self.verbose = verbose
 
-        self._llava_cpp = llava_cpp  # TODO: Fix
+        self._mtmd_cpp = mtmd_cpp  # TODO: Fix
         self._exit_stack = ExitStack()
         self._last_image_embed: Optional[
-            llava_cpp.CtypesPointer[llava_cpp.llava_image_embed]
+            mtmd_cpp.CtypesPointer[mtmd_cpp.mtmd_cpp_image_embed]
         ] = None
         self._last_image_hash: Optional[int] = None
 
         if not os.path.exists(clip_model_path):
             raise ValueError(f"Clip model path does not exist: {clip_model_path}")
 
         with suppress_stdout_stderr(disable=self.verbose):
-            clip_ctx = self._llava_cpp.clip_model_load(self.clip_model_path.encode(), 0)
+            clip_ctx = self._mtmd_cpp.clip_model_load(self.clip_model_path.encode(), 0)
 
         if clip_ctx is None:
             raise ValueError(f"Failed to load clip model: {clip_model_path}")
@@ -2739,14 +2739,14 @@ def __init__(self, clip_model_path: str, verbose: bool = True):
 
         def clip_free():
             with suppress_stdout_stderr(disable=self.verbose):
-                self._llava_cpp.clip_free(self.clip_ctx)
+                self._mtmd_cpp.clip_free(self.clip_ctx)
 
         self._exit_stack.callback(clip_free)
 
         def last_image_embed_free():
             with suppress_stdout_stderr(disable=self.verbose):
                 if self._last_image_embed is not None:
-                    self._llava_cpp.llava_image_embed_free(self._last_image_embed)
+                    self._mtmd_cpp.mtmd_cpp_image_embed_free(self._last_image_embed)
                     self._last_image_embed = None
 
         self._exit_stack.callback(last_image_embed_free)
@@ -2764,10 +2764,10 @@ def _embed_image_bytes(self, image_bytes: bytes, n_threads_batch: int = 1):
         with suppress_stdout_stderr(disable=self.verbose):
             # Free the previous image embed
             if self._last_image_embed is not None:
-                self._llava_cpp.llava_image_embed_free(self._last_image_embed)
+                self._mtmd_cpp.mtmd_cpp_image_embed_free(self._last_image_embed)
                 self._last_image_embed = None
                 self._last_image_hash = None
-            embed = self._llava_cpp.llava_image_embed_make_with_bytes(
+            embed = self._mtmd_cpp.mtmd_cpp_image_embed_make_with_bytes(
                 self.clip_ctx,
                 n_threads_batch,
                 (ctypes.c_uint8 * len(image_bytes)).from_buffer(
@@ -2955,7 +2955,7 @@ def eval_image(self, llama: llama.Llama, image_url: str):
         n_past = ctypes.c_int(llama.n_tokens)
         n_past_p = ctypes.pointer(n_past)
         with suppress_stdout_stderr(disable=self.verbose):
-            self._llava_cpp.llava_eval_image_embed(
+            self._mtmd_cpp.mtmd_cpp_eval_image_embed(
                 llama.ctx,
                 embed,
                 llama.n_batch,
@@ -3483,30 +3483,30 @@ def eval_image(self, llama: llama.Llama, image_url: str):
         )
 
         img_bytes = self.load_image(image_url)
-        img_u8_p = self._llava_cpp.clip_image_u8_init()
-        if not self._llava_cpp.clip_image_load_from_bytes(
+        img_u8_p = self._mtmd_cpp.clip_image_u8_init()
+        if not self._mtmd_cpp.clip_image_load_from_bytes(
            ctypes.create_string_buffer(img_bytes, len(img_bytes)),
            ctypes.c_size_t(len(img_bytes)),
            img_u8_p,
        ):
-            self._llava_cpp.clip_image_u8_free(img_u8_p)
+            self._mtmd_cpp.clip_image_u8_free(img_u8_p)
            raise ValueError("Failed to load image.")
 
-        img_f32_p = self._llava_cpp.clip_image_f32_batch_init()
-        if not self._llava_cpp.clip_image_preprocess(self.clip_ctx, img_u8_p, img_f32_p):
-            self._llava_cpp.clip_image_f32_batch_free(img_f32_p)
-            self._llava_cpp.clip_image_u8_free(img_u8_p)
+        img_f32_p = self._mtmd_cpp.clip_image_f32_batch_init()
+        if not self._mtmd_cpp.clip_image_preprocess(self.clip_ctx, img_u8_p, img_f32_p):
+            self._mtmd_cpp.clip_image_f32_batch_free(img_f32_p)
+            self._mtmd_cpp.clip_image_u8_free(img_u8_p)
            raise ValueError("Failed to preprocess image.")
 
         n_embd = llama_cpp.llama_model_n_embd(llama._model.model)
         embed = (ctypes.c_float * (n_tokens * n_embd))()
-        if not self._llava_cpp.clip_image_batch_encode(self.clip_ctx, llama.n_threads, img_f32_p, embed):
-            self._llava_cpp.clip_image_f32_batch_free(img_f32_p)
-            self._llava_cpp.clip_image_u8_free(img_u8_p)
+        if not self._mtmd_cpp.clip_image_batch_encode(self.clip_ctx, llama.n_threads, img_f32_p, embed):
+            self._mtmd_cpp.clip_image_f32_batch_free(img_f32_p)
+            self._mtmd_cpp.clip_image_u8_free(img_u8_p)
            raise ValueError("Failed to encode image.")
 
-        self._llava_cpp.clip_image_f32_batch_free(img_f32_p)
-        self._llava_cpp.clip_image_u8_free(img_u8_p)
+        self._mtmd_cpp.clip_image_f32_batch_free(img_f32_p)
+        self._mtmd_cpp.clip_image_u8_free(img_u8_p)
         llama_cpp.llama_set_causal_attn(llama.ctx, False)
 
         seq_id_0 = (ctypes.c_int32 * 1)()
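Note that only the private binding module the handler delegates to changes; its public constructor and the chat API around it stay the same. For orientation, typical usage of Llava15ChatHandler, unchanged by this commit (model, projector, and image paths here are placeholders):

```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# clip_model_path points at the multimodal projector GGUF (placeholder path).
chat_handler = Llava15ChatHandler(clip_model_path="mmproj.gguf", verbose=False)
llm = Llama(
    model_path="llava-v1.5-7b.Q4_K_M.gguf",
    chat_handler=chat_handler,
    n_ctx=4096,  # leave room in the context for the image embedding
)
response = llm.create_chat_completion(
    messages=[
        {"role": "user", "content": [
            {"type": "image_url", "image_url": {"url": "file:///path/to/image.png"}},
            {"type": "text", "text": "Describe this image."},
        ]},
    ]
)
```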
llama_cpp/mtmd_cpp.py

Lines changed: 31 additions & 31 deletions
@@ -35,101 +35,101 @@
 
 
 # Specify the base name of the shared library to load
-_libllava_base_name = "llava"
-_libllava_override_path = os.environ.get("LLAVA_CPP_LIB")
-_libllava_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libllava_override_path is None else pathlib.Path()
+_libmtmd_base_name = "mtmd"
+_libmtmd_override_path = os.environ.get("mtmd_CPP_LIB")
+_libmtmd_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libmtmd_override_path is None else pathlib.Path()
 
 # Load the library
-_libllava = load_shared_library(_libllava_base_name, _libllava_base_path)
+_libmtmd = load_shared_library(_libmtmd_base_name, _libmtmd_base_path)
 
-ctypes_function = ctypes_function_for_shared_library(_libllava)
+ctypes_function = ctypes_function_for_shared_library(_libmtmd)
 
 
 ################################################
-# llava.h
+# mtmd.h
 ################################################
 
 # struct clip_ctx;
 clip_ctx_p = NewType("clip_ctx_p", int)
 clip_ctx_p_ctypes = c_void_p
 
 
-# struct llava_image_embed {
+# struct mtmd_image_embed {
 #     float * embed;
 #     int n_image_pos;
 # };
-class llava_image_embed(Structure):
+class mtmd_image_embed(Structure):
     _fields_ = [
         ("embed", POINTER(c_float)),
         ("n_image_pos", c_int),
     ]
 
 
-# /** sanity check for clip <-> llava embed size match */
-# LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
+# /** sanity check for clip <-> mtmd embed size match */
+# mtmd_API bool mtmd_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
 @ctypes_function(
-    "llava_validate_embed_size",
+    "mtmd_validate_embed_size",
     [llama_cpp.llama_context_p_ctypes, clip_ctx_p_ctypes],
     c_bool,
 )
-def llava_validate_embed_size(
+def mtmd_validate_embed_size(
     ctx_llama: llama_cpp.llama_context_p, ctx_clip: clip_ctx_p, /
 ) -> bool:
     ...
 
 
 # /** build an image embed from image file bytes */
-# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
+# mtmd_API struct mtmd_image_embed * mtmd_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
 @ctypes_function(
-    "llava_image_embed_make_with_bytes",
+    "mtmd_image_embed_make_with_bytes",
     [clip_ctx_p_ctypes, c_int, POINTER(c_uint8), c_int],
-    POINTER(llava_image_embed),
+    POINTER(mtmd_image_embed),
 )
-def llava_image_embed_make_with_bytes(
+def mtmd_image_embed_make_with_bytes(
     ctx_clip: clip_ctx_p,
     n_threads: Union[c_int, int],
     image_bytes: CtypesArray[c_uint8],
     image_bytes_length: Union[c_int, int],
     /,
-) -> "_Pointer[llava_image_embed]":
+) -> "_Pointer[mtmd_image_embed]":
     ...
 
 
 # /** build an image embed from a path to an image filename */
-# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
+# mtmd_API struct mtmd_image_embed * mtmd_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
 @ctypes_function(
-    "llava_image_embed_make_with_filename",
+    "mtmd_image_embed_make_with_filename",
     [clip_ctx_p_ctypes, c_int, c_char_p],
-    POINTER(llava_image_embed),
+    POINTER(mtmd_image_embed),
 )
-def llava_image_embed_make_with_filename(
+def mtmd_image_embed_make_with_filename(
     ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes, /
-) -> "_Pointer[llava_image_embed]":
+) -> "_Pointer[mtmd_image_embed]":
     ...
 
 
-# LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
-# /** free an embedding made with llava_image_embed_make_* */
-@ctypes_function("llava_image_embed_free", [POINTER(llava_image_embed)], None)
-def llava_image_embed_free(embed: "_Pointer[llava_image_embed]", /):
+# mtmd_API void mtmd_image_embed_free(struct mtmd_image_embed * embed);
+# /** free an embedding made with mtmd_image_embed_make_* */
+@ctypes_function("mtmd_image_embed_free", [POINTER(mtmd_image_embed)], None)
+def mtmd_image_embed_free(embed: "_Pointer[mtmd_image_embed]", /):
     ...
 
 
 # /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
-# LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
+# mtmd_API bool mtmd_eval_image_embed(struct llama_context * ctx_llama, const struct mtmd_image_embed * embed, int n_batch, int * n_past);
 @ctypes_function(
-    "llava_eval_image_embed",
+    "mtmd_eval_image_embed",
     [
         llama_cpp.llama_context_p_ctypes,
-        POINTER(llava_image_embed),
+        POINTER(mtmd_image_embed),
         c_int,
         POINTER(c_int),
     ],
     c_bool,
 )
-def llava_eval_image_embed(
+def mtmd_eval_image_embed(
     ctx_llama: llama_cpp.llama_context_p,
-    embed: "_Pointer[llava_image_embed]",
+    embed: "_Pointer[mtmd_image_embed]",
     n_batch: Union[c_int, int],
     n_past: "_Pointer[c_int]",
     /,
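The bindings keep llava's one-to-one ctypes style, so driving them directly mirrors what the handler does internally. A hedged sketch under two assumptions: clip_model_load is still exported from this module (the handler above calls it), and an initialized llama context would be supplied for the commented-out eval step:

```python
import ctypes
import llama_cpp.mtmd_cpp as mtmd_cpp

# Load the CLIP/projector model; path is a placeholder.
clip_ctx = mtmd_cpp.clip_model_load(b"mmproj.gguf", 0)

# Build an image embedding from a file using 4 threads.
embed = mtmd_cpp.mtmd_image_embed_make_with_filename(clip_ctx, 4, b"image.png")
try:
    n_past = ctypes.c_int(0)
    # With an existing llama_cpp.Llama instance `llm`, the embedding would be
    # written into its context starting at position n_past:
    # mtmd_cpp.mtmd_eval_image_embed(llm.ctx, embed, 512, ctypes.pointer(n_past))
finally:
    # Embeddings made with mtmd_image_embed_make_* must be freed explicitly.
    mtmd_cpp.mtmd_image_embed_free(embed)
```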
