diff --git a/src/scope/core/pipelines/process.py b/src/scope/core/pipelines/process.py
index 8704c2626..2f8e15231 100644
--- a/src/scope/core/pipelines/process.py
+++ b/src/scope/core/pipelines/process.py
@@ -16,8 +16,8 @@ def preprocess_chunk(
     frames = []
     for frame in chunk:
-        # Move to pipeline device
-        frame = frame.to(device=device, dtype=dtype)
+        # Move to pipeline device first (likely as uint8), then convert dtype on device
+        frame = frame.to(device=device).to(dtype=dtype)
         frame = rearrange(frame, "T H W C -> T C H W")
 
         _, _, H, W = frame.shape
 
diff --git a/src/scope/server/frame_processor.py b/src/scope/server/frame_processor.py
index b6e18bbb6..ca58d0c24 100644
--- a/src/scope/server/frame_processor.py
+++ b/src/scope/server/frame_processor.py
@@ -836,15 +836,14 @@ def prepare_chunk(self, chunk_size: int) -> list[torch.Tensor]:
         for _ in range(last_idx + 1):
             self.frame_buffer.popleft()
 
-        # Convert VideoFrames to tensors
+        # Convert VideoFrames to tensors (keep as uint8, GPU will handle dtype conversion)
         tensor_frames = []
        for video_frame in video_frames:
-            # Convert VideoFrame into (1, H, W, C) tensor on cpu
+            # Convert VideoFrame into (1, H, W, C) uint8 tensor on cpu
             # The T=1 dimension is expected by preprocess_chunk which rearranges T H W C -> T C H W
-            tensor = (
-                torch.from_numpy(video_frame.to_ndarray(format="rgb24"))
-                .float()
-                .unsqueeze(0)
+            # Note: keep uint8 here and let the pipeline's preprocess_chunk convert to the target dtype on the GPU
+            tensor = torch.from_numpy(video_frame.to_ndarray(format="rgb24")).unsqueeze(
+                0
             )
             tensor_frames.append(tensor)
 
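
Note (reviewer sketch, not part of the patch): the point of both hunks is that the host-to-device copy now moves uint8 frames (1 byte per element) and the dtype conversion happens on the GPU, instead of inflating each frame to float32 (4 bytes per element) on the CPU before the copy. A minimal illustration of the two paths, assuming a CUDA device is available; the 512x512 frame size is hypothetical:

    import torch

    # A single (T=1, H, W, C) RGB frame, as produced by to_ndarray(format="rgb24").
    frame = torch.randint(0, 256, (1, 512, 512, 3), dtype=torch.uint8)

    # Old path: convert on the CPU, then copy ~3 MB of float32 over PCIe.
    old = frame.float().to(device="cuda")

    # New path: copy ~0.75 MB of uint8, then convert on the device.
    # (preprocess_chunk would pass the pipeline dtype, e.g. float16, here.)
    new = frame.to(device="cuda").to(dtype=torch.float32)

    assert torch.equal(old, new)  # both paths yield identical values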