diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000..6324d40
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.14
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f1c2230..e21606b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,7 +19,7 @@ pybind11_add_module(python-samplerate src/samplerate.cpp)
 target_include_directories(python-samplerate PRIVATE ./external/libsamplerate/include)
 
 if(MSVC)
-    target_compile_options(python-samplerate PRIVATE /EHsc /MP /bigobj)
+    target_compile_options(python-samplerate PRIVATE /EHsc /MP /bigobj $<$<NOT:$<CONFIG:Debug>>:/O2>)  # /O2 is incompatible with the /RTC1 flag MSVC Debug builds use
     set(CMAKE_EXE_LINKER_FLAGS /MANIFEST:NO)
 endif()
 
@@ -29,19 +29,29 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR
     target_compile_options(python-samplerate PRIVATE -std=c++14 -O3 -Wall -Wextra -fPIC)
 endif()
 
-### stick the package and libsamplerate version into the module
-target_compile_definitions(python-samplerate
-    PUBLIC LIBSAMPLERATE_VERSION="${LIBSAMPLERATE_VERSION}"
-    PRIVATE $<$<BOOL:${PACKAGE_VERSION_INFO}>:VERSION_INFO="${PACKAGE_VERSION_INFO}">
-)
-
-### Final target setup
+### Final target setup (the LTO_ENABLED definition below reads this property at generate time, so ordering is not significant)
 set_target_properties(
     python-samplerate
     PROPERTIES
         PREFIX ""
         OUTPUT_NAME "samplerate"
         LINKER_LANGUAGE C
+        INTERPROCEDURAL_OPTIMIZATION TRUE
     )
 
+### stick the package and libsamplerate version into the module
+target_compile_definitions(python-samplerate
+    PUBLIC LIBSAMPLERATE_VERSION="${LIBSAMPLERATE_VERSION}"
+    PRIVATE $<$<BOOL:${PACKAGE_VERSION_INFO}>:VERSION_INFO="${PACKAGE_VERSION_INFO}">
+    # Build information for debugging
+    PRIVATE BUILD_TYPE="$<CONFIG>"
+    PRIVATE COMPILER_ID="${CMAKE_CXX_COMPILER_ID}"
+    PRIVATE COMPILER_VERSION="${CMAKE_CXX_COMPILER_VERSION}"
+    PRIVATE CMAKE_VERSION="${CMAKE_VERSION}"
+    PRIVATE TARGET_ARCH="${CMAKE_SYSTEM_PROCESSOR}"
+    PRIVATE TARGET_OS="${CMAKE_SYSTEM_NAME}"
+    PUBLIC PYBIND11_VERSION_INFO="${PYBIND11_VERSION_INFO}" 
+    PRIVATE LTO_ENABLED=$<BOOL:$<TARGET_PROPERTY:python-samplerate,INTERPROCEDURAL_OPTIMIZATION>>
+)
+
 target_link_libraries(python-samplerate PUBLIC samplerate)
diff --git a/README.md b/README.md
index 9b7dce4..da4d6e0 100644
--- a/README.md
+++ b/README.md
@@ -54,9 +54,47 @@ assert np.allclose(output_data_simple, output_data_full)
 
 # See `samplerate.CallbackResampler` for the Callback API, or
 # `examples/play_modulation.py` for an example.
+
+# Callback API Example
+def producer():
+    # Generate data in chunks
+    for i in range(10):
+        yield np.random.uniform(-1, 1, 1024).astype(np.float32)
+    yield None # Signal end of stream
+
+data_iter = producer()
+callback = lambda: next(data_iter)
+
+resampler = samplerate.CallbackResampler(callback, ratio, converter)
+output_chunks = []
+while True:
+    # Read chunks of resampled data
+    chunk = resampler.read(512) 
+    if chunk.shape[0] == 0:
+        break
+    output_chunks.append(chunk)
 ```
 
-See `samplerate.resample`, `samplerate.Resampler`, and `samplerate.CallbackResampler` in the API documentation for details.
+## Performance Tips
+
+To get the maximum performance from `samplerate`:
+
+1.  **Use `np.float32`**: The underlying `libsamplerate` library operates on 32-bit floats. Passing `np.float64` (default numpy float) or integer arrays triggers an implicit copy and cast, which can be expensive.
+    ```python
+    # Fast (no copy)
+    data = np.zeros(1000, dtype=np.float32)
+    samplerate.resample(data, 1.5)
+
+    # Slower (implicit copy + cast)
+    data = np.zeros(1000, dtype=np.float64) 
+    samplerate.resample(data, 1.5)
+    ```
+2.  **Use C-Contiguous Arrays**: Ensure your input arrays are C-contiguous (row-major). Non-contiguous arrays (e.g., column slices) will also trigger a copy.
+3.  **Adjust GIL Threshold**: If you are processing many small chunks in a multi-threaded application, the default "auto" GIL release threshold (1000 frames) might be too high or too low. You can tune it:
+    ```python
+    # Release GIL even for small chunks (e.g. >= 100 frames)
+    samplerate.set_gil_release_threshold(100)
+    ```
 
 ## Multi-threading and GIL Control
 
@@ -67,6 +105,7 @@ import samplerate
 
 # Default: "auto" mode - releases GIL only for large data (>= 1000 frames)
 # Balances single-threaded performance with multi-threading capability
+# The threshold is configurable: samplerate.set_gil_release_threshold(2000)
 output = samplerate.resample(input_data, ratio)
 
 # Force GIL release - best for multi-threaded applications
diff --git a/external/CMakeLists.txt b/external/CMakeLists.txt
index 239b595..a4254b7 100644
--- a/external/CMakeLists.txt
+++ b/external/CMakeLists.txt
@@ -1,4 +1,6 @@
 include(FetchContent)
+# Set pybind11 Python finding policy to use new FindPython
+set(PYBIND11_FINDPYTHON ON CACHE BOOL "Use FindPython instead of FindPythonInterp")
 
 # pybind11
 FetchContent_Declare(
@@ -6,9 +8,13 @@ FetchContent_Declare(
   GIT_REPOSITORY https://github.com/pybind/pybind11
   GIT_TAG f5fbe867d2d26e4a0a9177a51f6e568868ad3dc8 # 3.0.1
 )
+## Change this if you update pybind11 version
+set(PYBIND11_VERSION_INFO "3.0.1" CACHE STRING "pybind11 version")  # <-- update pybind11 version here
+
 
 FetchContent_MakeAvailable(pybind11)
 
+
 # libsamplerate
 set(BUILD_TESTING OFF CACHE BOOL "Disable libsamplerate test build")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
diff --git a/pyproject.toml b/pyproject.toml
index b424d40..3db5df2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,3 +51,11 @@ build-frontend = "build[uv]"
 build = ["cp39-*", "cp310-*", "cp311-*", "cp312-*", "cp313-*","cp314-*"]
 # Skip 32-bit builds and musllinux wheels
 skip = ["*-win32", "*-manylinux_i686", "*-musllinux*"]
+
+[tool.uv]
+cache-keys = [  # files whose modification should trigger a rebuild of the cached wheel
+  { file = "pyproject.toml" },
+  { file = "setup.py" },
+  { file = "CMakeLists.txt" },
+  { file = "external/CMakeLists.txt" },  # a `dir` key only tracks the directory itself, not its contents
+  { file = "src/**/*" }]
\ No newline at end of file
diff --git a/src/samplerate.cpp b/src/samplerate.cpp
index c15eaa7..f0eba25 100644
--- a/src/samplerate.cpp
+++ b/src/samplerate.cpp
@@ -40,6 +40,35 @@
 #define VERSION_INFO "nightly"
 #endif
 
+// Build information defaults (set by CMake)
+#ifndef BUILD_TYPE
+#define BUILD_TYPE "unknown"
+#endif
+#ifndef COMPILER_ID
+#define COMPILER_ID "unknown"
+#endif
+#ifndef COMPILER_VERSION
+#define COMPILER_VERSION "unknown"
+#endif
+#ifndef CMAKE_VERSION
+#define CMAKE_VERSION "unknown"
+#endif
+#ifndef TARGET_ARCH
+#define TARGET_ARCH "unknown"
+#endif
+#ifndef TARGET_OS
+#define TARGET_OS "unknown"
+#endif
+#ifndef PYBIND11_VERSION_INFO
+#define PYBIND11_VERSION_INFO "unknown"
+#endif
+#ifndef LIBSAMPLERATE_VERSION
+#define LIBSAMPLERATE_VERSION "unknown"
+#endif
+#ifndef LTO_ENABLED
+#define LTO_ENABLED 0
+#endif
+
 // This value was empirically and somewhat arbitrarily chosen; increase it for further safety.
 #define END_OF_INPUT_EXTRA_OUTPUT_FRAMES 10000
 
@@ -51,7 +80,7 @@
 // with multi-threaded performance (allowing parallelism for large data).
 // Empirically chosen based on benchmarks showing that at 1000 frames, the GIL
 // overhead is < 1% of total execution time for even the fastest converter types.
-#define GIL_RELEASE_THRESHOLD_FRAMES 1000
+static long gil_release_threshold_frames = 1000;  // runtime-tunable "auto" threshold; file-local (internal linkage)
 
 namespace py = pybind11;
 using namespace pybind11::literals;
@@ -72,13 +101,13 @@ namespace samplerate {
 bool should_release_gil(const py::object &release_gil, long num_frames) {
   if (release_gil.is_none()) {
     // "auto" mode: release GIL only for large data sizes
-    return num_frames >= GIL_RELEASE_THRESHOLD_FRAMES;
+    return num_frames >= gil_release_threshold_frames;
   } else if (py::isinstance<py::bool_>(release_gil)) {
     return release_gil.cast<bool>();
   } else if (py::isinstance<py::str>(release_gil)) {
     std::string s = release_gil.cast<std::string>();
     if (s == "auto") {
-      return num_frames >= GIL_RELEASE_THRESHOLD_FRAMES;
+      return num_frames >= gil_release_threshold_frames;
     }
     throw std::domain_error("Invalid release_gil value. Use True, False, None, or 'auto'.");
   }
@@ -177,7 +206,7 @@ class Resampler {
   ~Resampler() { src_delete(_state); }  // src_delete handles nullptr case
 
   py::array_t<float, py::array::c_style> process(
-      py::array_t<float, py::array::c_style | py::array::forcecast> input,
+      const py::array_t<float, py::array::c_style | py::array::forcecast> &input,
       double sr_ratio, bool end_of_input,
       const py::object &release_gil = py::none()) {
     // accessors for the arrays
@@ -213,7 +242,7 @@ class Resampler {
     SRC_DATA src_data = {
         static_cast<float *>(inbuf.ptr),   // data_in
         static_cast<float *>(outbuf.ptr),  // data_out
-        inbuf.shape[0],                    // input_frames
+        static_cast<long>(inbuf.shape[0]), // input_frames
         long(new_size),                    // output_frames
         0,             // input_frames_used, filled by libsamplerate
         0,             // output_frames_gen, filled by libsamplerate
@@ -505,7 +534,7 @@ py::array_t<float, py::array::c_style> resample(
   SRC_DATA src_data = {
       static_cast<float *>(inbuf.ptr),   // data_in
       static_cast<float *>(outbuf.ptr),  // data_out
-      inbuf.shape[0],                    // input_frames
+      static_cast<long>(inbuf.shape[0]), // input_frames
       long(new_size),                    // output_frames
       0,        // input_frames_used, filled by libsamplerate
       0,        // output_frames_gen, filled by libsamplerate
@@ -559,6 +588,78 @@ PYBIND11_MODULE(samplerate, m) {
   m.attr("__version__") = VERSION_INFO;
   m.attr("__libsamplerate_version__") = LIBSAMPLERATE_VERSION;
 
+  m.def("set_gil_release_threshold", [](long threshold) {
+    gil_release_threshold_frames = threshold;
+  }, "Set the minimum number of frames required to release the GIL in 'auto' mode.");
+
+  m.def("get_gil_release_threshold", []() {
+    return gil_release_threshold_frames;
+  }, "Get the minimum number of frames required to release the GIL in 'auto' mode.");
+
+  m.def("get_build_info", []() {
+    py::dict info;
+    info["version"] = VERSION_INFO;
+    info["libsamplerate_version"] = LIBSAMPLERATE_VERSION;
+    info["build_type"] = BUILD_TYPE;
+    info["compiler_id"] = COMPILER_ID;
+    info["compiler_version"] = COMPILER_VERSION;
+    info["cmake_version"] = CMAKE_VERSION;
+    info["target_arch"] = TARGET_ARCH;
+    info["target_os"] = TARGET_OS;
+    info["pybind11_version"] = PYBIND11_VERSION_INFO;
+    // C++ standard - MSVC uses _MSVC_LANG instead of __cplusplus
+#ifdef _MSVC_LANG
+    #define CPP_STD_VALUE _MSVC_LANG
+#else
+    #define CPP_STD_VALUE __cplusplus
+#endif
+#if CPP_STD_VALUE >= 202002L
+    info["cpp_standard"] = "C++20";
+#elif CPP_STD_VALUE >= 201703L
+    info["cpp_standard"] = "C++17";
+#elif CPP_STD_VALUE >= 201402L
+    info["cpp_standard"] = "C++14";
+#elif CPP_STD_VALUE >= 201103L
+    info["cpp_standard"] = "C++11";
+#else
+    info["cpp_standard"] = "pre-C++11";
+#endif
+#undef CPP_STD_VALUE
+    // LTO status (passed from CMake)
+#if LTO_ENABLED
+    info["lto_enabled"] = true;
+#else
+    info["lto_enabled"] = false;
+#endif
+    // Pointer size (32 vs 64 bit)
+    info["pointer_size_bits"] = sizeof(void*) * 8;
+    // Float size sanity check
+    info["float_size_bytes"] = sizeof(float);
+    info["gil_release_threshold"] = gil_release_threshold_frames;
+    return info;
+  }, R"doc(
+Get detailed build information for debugging purposes.
+
+Returns
+-------
+dict
+    Dictionary containing:
+    - version: Package version
+    - libsamplerate_version: libsamplerate library version
+    - build_type: Build configuration (Release, Debug, etc.)
+    - compiler_id: Compiler used (MSVC, GNU, Clang, etc.)
+    - compiler_version: Compiler version string
+    - cmake_version: CMake version used for build
+    - target_arch: Target architecture (x86_64, arm64, etc.)
+    - target_os: Target operating system
+    - pybind11_version: pybind11 version
+    - cpp_standard: C++ standard used
+    - lto_enabled: Whether Link Time Optimization was enabled
+    - pointer_size_bits: Pointer size (32 or 64)
+    - float_size_bytes: Size of float type (should be 4)
+    - gil_release_threshold: Current GIL release threshold
+)doc");
+
   auto m_exceptions = m.def_submodule(
       "exceptions", "Sub-module containing sampling exceptions");
   auto m_converters = m.def_submodule(
diff --git a/src/samplerate.pyi b/src/samplerate.pyi
new file mode 100644
index 0000000..f926391
--- /dev/null
+++ b/src/samplerate.pyi
@@ -0,0 +1,81 @@
+from typing import Optional, Union, Callable, Iterator, Tuple, overload, TypedDict
+import numpy as np
+import numpy.typing as npt
+
+class BuildInfo(TypedDict):
+    version: str
+    libsamplerate_version: str
+    build_type: str
+    compiler_id: str
+    compiler_version: str
+    cmake_version: str
+    target_arch: str
+    target_os: str
+    pybind11_version: str
+    cpp_standard: str
+    lto_enabled: bool
+    pointer_size_bits: int
+    float_size_bytes: int
+    gil_release_threshold: int
+
+class ConverterType:
+    sinc_best: int
+    sinc_medium: int
+    sinc_fastest: int
+    zero_order_hold: int
+    linear: int
+
+class ResamplingError(RuntimeError): ...
+
+def set_gil_release_threshold(threshold: int) -> None: ...
+def get_gil_release_threshold() -> int: ...
+def get_build_info() -> BuildInfo: ...
+
+def resample(
+    input_data: npt.NDArray[np.float32],
+    ratio: float,
+    converter_type: Union[ConverterType, str, int] = "sinc_best",
+    verbose: bool = False,
+    release_gil: Optional[Union[bool, str]] = None,
+) -> npt.NDArray[np.float32]: ...
+
+class Resampler:
+    converter_type: int
+    channels: int
+    def __init__(
+        self,
+        converter_type: Union[ConverterType, str, int] = "sinc_best",
+        channels: int = 1,
+    ) -> None: ...
+    def process(
+        self,
+        input_data: npt.NDArray[np.float32],
+        ratio: float,
+        end_of_input: bool = False,
+        release_gil: Optional[Union[bool, str]] = None,
+    ) -> npt.NDArray[np.float32]: ...
+    def reset(self) -> None: ...
+    def set_ratio(self, new_ratio: float) -> None: ...
+    def clone(self) -> "Resampler": ...
+
+class CallbackResampler:
+    ratio: float
+    converter_type: int
+    channels: int
+    def __init__(
+        self,
+        callback: Callable[[], Optional[npt.NDArray[np.float32]]],
+        ratio: float,
+        converter_type: Union[ConverterType, str, int] = "sinc_best",
+        channels: int = 1,
+    ) -> None: ...
+    def read(
+        self,
+        num_frames: int,
+        release_gil: Optional[Union[bool, str]] = None,
+    ) -> npt.NDArray[np.float32]: ...
+    def reset(self) -> None: ...
+    def set_starting_ratio(self, new_ratio: float) -> None: ...
+    def clone(self) -> "CallbackResampler": ...
+    def __enter__(self) -> "CallbackResampler": ...
+    def __exit__(self, exc_type, exc, exc_tb) -> None: ...
diff --git a/tests/test_datatype_performance.py b/tests/test_datatype_performance.py
new file mode 100644
index 0000000..de14797
--- /dev/null
+++ b/tests/test_datatype_performance.py
@@ -0,0 +1,45 @@
+import time
+import numpy as np
+import samplerate
+
+def benchmark_resample(input_data, ratio=1.5, converter='sinc_fastest'):
+    start_time = time.perf_counter()
+    samplerate.resample(input_data, ratio, converter)
+    end_time = time.perf_counter()
+    return end_time - start_time
+
+def test_datatype_performance():
+    # Generate 1 second of audio at 44.1kHz
+    fs = 44100
+    duration = 15.0
+    t = np.arange(fs * duration) / fs
+    
+    # Create float64 (default) and float32 arrays
+    data_float64 = np.sin(2 * np.pi * 440 * t)
+    data_float32 = data_float64.astype(np.float32)
+    
+    # Warmup
+    benchmark_resample(data_float32)
+    
+    # Benchmark float32 (native)
+    times_f32 = []
+    for _ in range(10):
+        times_f32.append(benchmark_resample(data_float32))
+    avg_f32 = np.mean(times_f32)
+    
+    # Benchmark float64 (requires conversion)
+    times_f64 = []
+    for _ in range(10):
+        times_f64.append(benchmark_resample(data_float64))
+    avg_f64 = np.mean(times_f64)
+    
+    print(f"\nPerformance Comparison (1s audio, sinc_fastest):")
+    print(f"float32 (native): {avg_f32*1000:.3f} ms")
+    print(f"float64 (copy):   {avg_f64*1000:.3f} ms")
+    print(f"Overhead:         {(avg_f64 - avg_f32)*1000:.3f} ms ({(avg_f64/avg_f32 - 1)*100:.1f}%)")
+    
+    # We expect float32 to be faster, but we won't fail the test if it isn't 
+    # (machine noise can affect small benchmarks), just report it.
+
+if __name__ == "__main__":
+    test_datatype_performance()
\ No newline at end of file