// Patch summary (src/unicode.cpp), describing only what the hunks below show:
//  - Adds a file-scope constant `regex_flags`, chosen at compile time by `_MSC_VER`:
//      * `_MSC_VER` defined     -> std::regex_constants::ECMAScript
//      * `_MSC_VER` not defined -> std::regex_constants::nosubs | std::regex_constants::optimize
//  - Both `unicode_regex_split_stl` overloads (the std::wstring one building a std::wregex and the
//    std::string one building a std::regex) now pass `regex_flags` to the regex constructor instead
//    of spelling `optimize | nosubs` inline.
//  - Where `_MSC_VER` is not defined the flag set is the same as before (only the operand order of
//    the bitwise OR differs); where it is defined, neither `nosubs` nor `optimize` is passed any more.
// NOTE(review): the patch text does not say why MSVC needs a different flag set - confirm against
//               the commit message before relying on any particular explanation.
// NOTE(review): `std::vector` appears without template arguments throughout this text and the whole
//               patch sits on one line; presumably extraction damage - verify against the upstream
//               file before attempting to apply it.
diff --git a/src/unicode.cpp b/src/unicode.cpp index bb44edfaddf..5d66f27c74e 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -497,9 +497,15 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & return bpe_offsets; } +#ifdef _MSC_VER +constexpr auto regex_flags = std::regex_constants::ECMAScript; +#else +constexpr auto regex_flags = std::regex_constants::nosubs | std::regex_constants::optimize; +#endif + // use std::wregex to split the text static std::vector unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector & offsets) { - std::wregex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs); + std::wregex expr(regex_expr, regex_flags); std::vector bpe_offsets; // store the offset of each word bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size size_t start = 0; @@ -529,7 +535,7 @@ static std::vector unicode_regex_split_stl(const std::wstring & wtext, c // use std::regex to split the text static std::vector unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector & offsets) { - std::regex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs); + std::regex expr(regex_expr, regex_flags); std::vector bpe_offsets; // store the offset of each word bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size size_t start = 0;