
Commit 1d23ae0

migrate llava to mtmd
1 parent 54691ca

File tree

3 files changed, +62 −72 lines changed
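In short, the commit renames the LLAVA_BUILD option to MTMD_BUILD, replaces the llava shared library with mtmd, and moves the ctypes bindings from llama_cpp.llava_cpp to llama_cpp.mtmd_cpp, with llava_* symbols renamed to mtmd_*. A minimal sketch of what the rename looks like from calling code (the commented calls assume an already-loaded clip context, which is not part of this excerpt):

```python
# Before this commit, callers imported the llava bindings:
# import llama_cpp.llava_cpp as llava_cpp
# embed = llava_cpp.llava_image_embed_make_with_filename(clip_ctx, 4, b"image.png")

# After this commit, the same surface lives in llama_cpp.mtmd_cpp:
import llama_cpp.mtmd_cpp as mtmd_cpp

# embed = mtmd_cpp.mtmd_image_embed_make_with_filename(clip_ctx, 4, b"image.png")
# mtmd_cpp.mtmd_image_embed_free(embed)
```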

CMakeLists.txt

Lines changed: 10 additions & 20 deletions
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.21)
 project(llama_cpp)
 
 option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python package" ON)
-option(LLAVA_BUILD "Build llava shared library and install alongside python package" ON)
+option(MTMD_BUILD "Build multimodal (mtmd) shared library and install alongside python package" ON)
 
 function(llama_cpp_python_install_target target)
     if(NOT TARGET ${target})
@@ -135,7 +135,7 @@ if (LLAMA_BUILD)
         )
     endif()
 
-    if (LLAVA_BUILD)
+    if (MTMD_BUILD)
         if (LLAMA_CUBLAS OR LLAMA_CUDA)
             add_compile_definitions(GGML_USE_CUBLAS)
             add_compile_definitions(GGML_USE_CUDA)
@@ -145,36 +145,26 @@ if (LLAMA_BUILD)
             add_compile_definitions(GGML_USE_METAL)
         endif()
 
-        # Building llava
+        # Building multimodal support using mtmd
         add_subdirectory(vendor/llama.cpp/tools/mtmd)
-        set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
 
         if (WIN32)
-            set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF)
+            set_target_properties(mtmd PROPERTIES CUDA_ARCHITECTURES OFF)
         endif()
-        llama_cpp_python_install_target(llava_shared)
+        llama_cpp_python_install_target(mtmd)
        if (WIN32)
            install(
-                FILES $<TARGET_RUNTIME_DLLS:llava_shared>
+                FILES $<TARGET_RUNTIME_DLLS:mtmd>
                DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib
            )
            install(
-                FILES $<TARGET_RUNTIME_DLLS:llava_shared>
+                FILES $<TARGET_RUNTIME_DLLS:mtmd>
                DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib
            )
        endif()
 
-        # Fix for llava build: Add include directory for llama.h
-        # Move these commands after the add_subdirectory call
-        target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
-        target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include)
-
-        if (BUILD_SHARED_LIBS)
-            target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
-            target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include)
-        endif()
-
-        target_include_directories(llama-llava-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
-        target_include_directories(llama-minicpmv-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
+        # Add include directories for mtmd
+        target_include_directories(mtmd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
+        target_include_directories(mtmd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include)
    endif()
endif()
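Since the shared library target and its output name change from llava to mtmd, a build that silently picks up a stale llava library is the main migration hazard. A small, hedged check of the installed artifact (the file name suffix is platform dependent, and the lib directory assumed here is the one targeted by the install() rules above):

```python
import ctypes
import pathlib
import llama_cpp

# The mtmd target installs next to the package, in llama_cpp/lib,
# just as the old llava library did.
lib_dir = pathlib.Path(llama_cpp.__file__).parent / "lib"

# Library file name is platform dependent: libmtmd.so, libmtmd.dylib, or mtmd.dll.
candidates = sorted(lib_dir.glob("*mtmd*"))
print("found:", candidates)
if candidates:
    ctypes.CDLL(str(candidates[0]))  # raises OSError if the built library is broken
```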

llama_cpp/llama_chat_format.py

Lines changed: 21 additions & 21 deletions
@@ -2714,23 +2714,23 @@ class Llava15ChatHandler:
         )
 
     def __init__(self, clip_model_path: str, verbose: bool = True):
-        import llama_cpp.llava_cpp as llava_cpp
+        import llama_cpp.mtmd_cpp as mtmd_cpp
 
         self.clip_model_path = clip_model_path
         self.verbose = verbose
 
-        self._llava_cpp = llava_cpp  # TODO: Fix
+        self._mtmd_cpp = mtmd_cpp  # TODO: Fix
         self._exit_stack = ExitStack()
         self._last_image_embed: Optional[
-            llava_cpp.CtypesPointer[llava_cpp.llava_image_embed]
+            mtmd_cpp.CtypesPointer[mtmd_cpp.mtmd_cpp_image_embed]
         ] = None
         self._last_image_hash: Optional[int] = None
 
         if not os.path.exists(clip_model_path):
             raise ValueError(f"Clip model path does not exist: {clip_model_path}")
 
         with suppress_stdout_stderr(disable=self.verbose):
-            clip_ctx = self._llava_cpp.clip_model_load(self.clip_model_path.encode(), 0)
+            clip_ctx = self._mtmd_cpp.clip_model_load(self.clip_model_path.encode(), 0)
 
         if clip_ctx is None:
             raise ValueError(f"Failed to load clip model: {clip_model_path}")
@@ -2739,14 +2739,14 @@ def __init__(self, clip_model_path: str, verbose: bool = True):
 
         def clip_free():
             with suppress_stdout_stderr(disable=self.verbose):
-                self._llava_cpp.clip_free(self.clip_ctx)
+                self._mtmd_cpp.clip_free(self.clip_ctx)
 
         self._exit_stack.callback(clip_free)
 
         def last_image_embed_free():
             with suppress_stdout_stderr(disable=self.verbose):
                 if self._last_image_embed is not None:
-                    self._llava_cpp.llava_image_embed_free(self._last_image_embed)
+                    self._mtmd_cpp.mtmd_cpp_image_embed_free(self._last_image_embed)
                     self._last_image_embed = None
 
         self._exit_stack.callback(last_image_embed_free)
@@ -2764,10 +2764,10 @@ def _embed_image_bytes(self, image_bytes: bytes, n_threads_batch: int = 1):
         with suppress_stdout_stderr(disable=self.verbose):
             # Free the previous image embed
             if self._last_image_embed is not None:
-                self._llava_cpp.llava_image_embed_free(self._last_image_embed)
+                self._mtmd_cpp.mtmd_cpp_image_embed_free(self._last_image_embed)
                 self._last_image_embed = None
                 self._last_image_hash = None
-            embed = self._llava_cpp.llava_image_embed_make_with_bytes(
+            embed = self._mtmd_cpp.mtmd_cpp_image_embed_make_with_bytes(
                 self.clip_ctx,
                 n_threads_batch,
                 (ctypes.c_uint8 * len(image_bytes)).from_buffer(
@@ -2955,7 +2955,7 @@ def eval_image(self, llama: llama.Llama, image_url: str):
         n_past = ctypes.c_int(llama.n_tokens)
         n_past_p = ctypes.pointer(n_past)
         with suppress_stdout_stderr(disable=self.verbose):
-            self._llava_cpp.llava_eval_image_embed(
+            self._mtmd_cpp.mtmd_cpp_eval_image_embed(
                 llama.ctx,
                 embed,
                 llama.n_batch,
@@ -3483,30 +3483,30 @@ def eval_image(self, llama: llama.Llama, image_url: str):
         )
 
         img_bytes = self.load_image(image_url)
-        img_u8_p = self._llava_cpp.clip_image_u8_init()
-        if not self._llava_cpp.clip_image_load_from_bytes(
+        img_u8_p = self._mtmd_cpp.clip_image_u8_init()
+        if not self._mtmd_cpp.clip_image_load_from_bytes(
            ctypes.create_string_buffer(img_bytes, len(img_bytes)),
            ctypes.c_size_t(len(img_bytes)),
            img_u8_p,
        ):
-            self._llava_cpp.clip_image_u8_free(img_u8_p)
+            self._mtmd_cpp.clip_image_u8_free(img_u8_p)
            raise ValueError("Failed to load image.")
 
-        img_f32_p = self._llava_cpp.clip_image_f32_batch_init()
-        if not self._llava_cpp.clip_image_preprocess(self.clip_ctx, img_u8_p, img_f32_p):
-            self._llava_cpp.clip_image_f32_batch_free(img_f32_p)
-            self._llava_cpp.clip_image_u8_free(img_u8_p)
+        img_f32_p = self._mtmd_cpp.clip_image_f32_batch_init()
+        if not self._mtmd_cpp.clip_image_preprocess(self.clip_ctx, img_u8_p, img_f32_p):
+            self._mtmd_cpp.clip_image_f32_batch_free(img_f32_p)
+            self._mtmd_cpp.clip_image_u8_free(img_u8_p)
            raise ValueError("Failed to preprocess image.")
 
         n_embd = llama_cpp.llama_model_n_embd(llama._model.model)
         embed = (ctypes.c_float * (n_tokens * n_embd))()
-        if not self._llava_cpp.clip_image_batch_encode(self.clip_ctx, llama.n_threads, img_f32_p, embed):
-            self._llava_cpp.clip_image_f32_batch_free(img_f32_p)
-            self._llava_cpp.clip_image_u8_free(img_u8_p)
+        if not self._mtmd_cpp.clip_image_batch_encode(self.clip_ctx, llama.n_threads, img_f32_p, embed):
+            self._mtmd_cpp.clip_image_f32_batch_free(img_f32_p)
+            self._mtmd_cpp.clip_image_u8_free(img_u8_p)
            raise ValueError("Failed to encode image.")
 
-        self._llava_cpp.clip_image_f32_batch_free(img_f32_p)
-        self._llava_cpp.clip_image_u8_free(img_u8_p)
+        self._mtmd_cpp.clip_image_f32_batch_free(img_f32_p)
+        self._mtmd_cpp.clip_image_u8_free(img_u8_p)
         llama_cpp.llama_set_causal_attn(llama.ctx, False)
 
         seq_id_0 = (ctypes.c_int32 * 1)()
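Note that only the private binding module the handler delegates to changes; its public constructor and the chat API around it stay the same. For orientation, typical usage of Llava15ChatHandler, unchanged by this commit (model, projector, and image paths here are placeholders):

```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# clip_model_path points at the multimodal projector GGUF (placeholder path).
chat_handler = Llava15ChatHandler(clip_model_path="mmproj.gguf", verbose=False)
llm = Llama(
    model_path="llava-v1.5-7b.Q4_K_M.gguf",
    chat_handler=chat_handler,
    n_ctx=4096,  # leave room in the context for the image embedding
)
response = llm.create_chat_completion(
    messages=[
        {"role": "user", "content": [
            {"type": "image_url", "image_url": {"url": "file:///path/to/image.png"}},
            {"type": "text", "text": "Describe this image."},
        ]},
    ]
)
```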
llama_cpp/mtmd_cpp.py

Lines changed: 31 additions & 31 deletions
@@ -35,101 +35,101 @@
 
 
 # Specify the base name of the shared library to load
-_libllava_base_name = "llava"
-_libllava_override_path = os.environ.get("LLAVA_CPP_LIB")
-_libllava_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libllava_override_path is None else pathlib.Path()
+_libmtmd_base_name = "mtmd"
+_libmtmd_override_path = os.environ.get("mtmd_CPP_LIB")
+_libmtmd_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libmtmd_override_path is None else pathlib.Path()
 
 # Load the library
-_libllava = load_shared_library(_libllava_base_name, _libllava_base_path)
+_libmtmd = load_shared_library(_libmtmd_base_name, _libmtmd_base_path)
 
-ctypes_function = ctypes_function_for_shared_library(_libllava)
+ctypes_function = ctypes_function_for_shared_library(_libmtmd)
 
 
 ################################################
-# llava.h
+# mtmd.h
 ################################################
 
 # struct clip_ctx;
 clip_ctx_p = NewType("clip_ctx_p", int)
 clip_ctx_p_ctypes = c_void_p
 
 
-# struct llava_image_embed {
+# struct mtmd_image_embed {
 #     float * embed;
 #     int n_image_pos;
 # };
-class llava_image_embed(Structure):
+class mtmd_image_embed(Structure):
     _fields_ = [
         ("embed", POINTER(c_float)),
         ("n_image_pos", c_int),
     ]
 
 
-# /** sanity check for clip <-> llava embed size match */
-# LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
+# /** sanity check for clip <-> mtmd embed size match */
+# mtmd_API bool mtmd_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
 @ctypes_function(
-    "llava_validate_embed_size",
+    "mtmd_validate_embed_size",
     [llama_cpp.llama_context_p_ctypes, clip_ctx_p_ctypes],
     c_bool,
 )
-def llava_validate_embed_size(
+def mtmd_validate_embed_size(
     ctx_llama: llama_cpp.llama_context_p, ctx_clip: clip_ctx_p, /
 ) -> bool:
     ...
 
 
 # /** build an image embed from image file bytes */
-# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
+# mtmd_API struct mtmd_image_embed * mtmd_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
 @ctypes_function(
-    "llava_image_embed_make_with_bytes",
+    "mtmd_image_embed_make_with_bytes",
     [clip_ctx_p_ctypes, c_int, POINTER(c_uint8), c_int],
-    POINTER(llava_image_embed),
+    POINTER(mtmd_image_embed),
 )
-def llava_image_embed_make_with_bytes(
+def mtmd_image_embed_make_with_bytes(
     ctx_clip: clip_ctx_p,
     n_threads: Union[c_int, int],
     image_bytes: CtypesArray[c_uint8],
     image_bytes_length: Union[c_int, int],
     /,
-) -> "_Pointer[llava_image_embed]":
+) -> "_Pointer[mtmd_image_embed]":
     ...
 
 
 # /** build an image embed from a path to an image filename */
-# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
+# mtmd_API struct mtmd_image_embed * mtmd_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
 @ctypes_function(
-    "llava_image_embed_make_with_filename",
+    "mtmd_image_embed_make_with_filename",
     [clip_ctx_p_ctypes, c_int, c_char_p],
-    POINTER(llava_image_embed),
+    POINTER(mtmd_image_embed),
 )
-def llava_image_embed_make_with_filename(
+def mtmd_image_embed_make_with_filename(
     ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes, /
-) -> "_Pointer[llava_image_embed]":
+) -> "_Pointer[mtmd_image_embed]":
     ...
 
 
-# LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
-# /** free an embedding made with llava_image_embed_make_* */
-@ctypes_function("llava_image_embed_free", [POINTER(llava_image_embed)], None)
-def llava_image_embed_free(embed: "_Pointer[llava_image_embed]", /):
+# mtmd_API void mtmd_image_embed_free(struct mtmd_image_embed * embed);
+# /** free an embedding made with mtmd_image_embed_make_* */
+@ctypes_function("mtmd_image_embed_free", [POINTER(mtmd_image_embed)], None)
+def mtmd_image_embed_free(embed: "_Pointer[mtmd_image_embed]", /):
     ...
 
 
 # /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
-# LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
+# mtmd_API bool mtmd_eval_image_embed(struct llama_context * ctx_llama, const struct mtmd_image_embed * embed, int n_batch, int * n_past);
 @ctypes_function(
-    "llava_eval_image_embed",
+    "mtmd_eval_image_embed",
     [
         llama_cpp.llama_context_p_ctypes,
-        POINTER(llava_image_embed),
+        POINTER(mtmd_image_embed),
         c_int,
         POINTER(c_int),
     ],
     c_bool,
 )
-def llava_eval_image_embed(
+def mtmd_eval_image_embed(
     ctx_llama: llama_cpp.llama_context_p,
-    embed: "_Pointer[llava_image_embed]",
+    embed: "_Pointer[mtmd_image_embed]",
     n_batch: Union[c_int, int],
     n_past: "_Pointer[c_int]",
     /,
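The bindings keep llava's one-to-one ctypes style, so driving them directly mirrors what the handler does internally. A hedged sketch under two assumptions: clip_model_load is still exported from this module (the handler above calls it), and an initialized llama context would be supplied for the commented-out eval step:

```python
import ctypes
import llama_cpp.mtmd_cpp as mtmd_cpp

# Load the CLIP/projector model; path is a placeholder.
clip_ctx = mtmd_cpp.clip_model_load(b"mmproj.gguf", 0)

# Build an image embedding from a file using 4 threads.
embed = mtmd_cpp.mtmd_image_embed_make_with_filename(clip_ctx, 4, b"image.png")
try:
    n_past = ctypes.c_int(0)
    # With an existing llama_cpp.Llama instance `llm`, the embedding would be
    # written into its context starting at position n_past:
    # mtmd_cpp.mtmd_eval_image_embed(llm.ctx, embed, 512, ctypes.pointer(n_past))
finally:
    # Embeddings made with mtmd_image_embed_make_* must be freed explicitly.
    mtmd_cpp.mtmd_image_embed_free(embed)
```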
