move latent shuffle logic to latent-preview.h

stduhpf · stduhpf · commit 615c6f372969 · 2025-11-29T12:35:35.000+01:00
diff --git a/latent-preview.h b/latent-preview.h
@@ -163,6 +163,133 @@ const float sd_latent_rgb_proj[4][3] = {
     {-0.178022f, -0.200862f, -0.678514f}};
 float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f};
 
+// Reverses the channel-wise patch packing of a latent tensor, in place.
+//
+// The tensor is read as ne = [W, H, C, N]. With p = patch_size, input channel c
+// is decomposed as c = c_out * p*p + py * p + px, and the element at
+// (x, y, c, n) is moved to (x * p + px, y * p + py, c_out, n). The result has
+// ne = [W * p, H * p, C / (p*p), N]; ne[0..2] and nb[0..3] are rewritten for
+// the new densely packed layout. The data pointer itself is not changed.
+//
+// latents    : tensor to rearrange. Its data is consumed linearly, one element
+//              after another, so it is assumed to be densely packed with nb[0]
+//              equal to the element size.
+// patch_size : patch edge length p; must be > 0 and p*p must divide ne[2].
+// dst_buf    : optional scratch space holding the WHOLE tensor, i.e. at least
+//              N * C * H * W * nb[0] bytes (all batch entries, not just one).
+//              When nullptr, a temporary buffer is allocated and released here.
+//
+// The call is a no-op (tensor left untouched) when the arguments are invalid or
+// the temporary buffer cannot be allocated.
+void unpatchify_latents(ggml_tensor* latents, int patch_size, char* dst_buf) {
+    if (latents == nullptr || patch_size <= 0) {
+        return;
+    }
+
+    const int64_t p          = patch_size;
+    const int64_t patch_area = p * p;
+
+    const int64_t N    = latents->ne[3];
+    const int64_t C_in = latents->ne[2];
+    const int64_t H_in = latents->ne[1];
+    const int64_t W_in = latents->ne[0];
+
+    // A channel count that is not a multiple of p*p would map the trailing
+    // channels past the last output channel, i.e. outside the scratch buffer.
+    if (C_in % patch_area != 0) {
+        return;
+    }
+
+    const int64_t C_out = C_in / patch_area;
+    const int64_t H_out = H_in * p;
+    const int64_t W_out = W_in * p;
+
+    const size_t elem_size = latents->nb[0];
+
+    const size_t dst_stride_w = elem_size;
+    const size_t dst_stride_h = dst_stride_w * W_out;
+    const size_t dst_stride_c = dst_stride_h * H_out;
+    const size_t dst_stride_n = dst_stride_c * C_out;
+
+    // Consecutive source pixels of one channel land p pixels apart in the output.
+    const size_t dst_step_w = dst_stride_w * p;
+    const size_t dst_step_h = dst_stride_h * p;
+
+    // The loops below write every batch entry, so the scratch space has to cover
+    // N entries; nb[3] alone is only the size of a single one.
+    const size_t dst_buf_size = dst_stride_n * N;
+
+    char* owned_buf = nullptr;
+    if (dst_buf == nullptr && dst_buf_size > 0) {
+        owned_buf = static_cast<char*>(malloc(dst_buf_size));
+        if (owned_buf == nullptr) {
+            return;
+        }
+        dst_buf = owned_buf;
+    }
+
+    if (dst_buf_size > 0) {
+        const char* src_ptr = static_cast<const char*>(latents->data);
+
+        for (int64_t n = 0; n < N; ++n) {
+            for (int64_t c = 0; c < C_in; ++c) {
+                const int64_t c_out = c / patch_area;
+                const int64_t rem   = c % patch_area;
+                const int64_t py    = rem / p;
+                const int64_t px    = rem % p;
+
+                // Top-left output position served by this input channel.
+                char* dst_layer = dst_buf + n * dst_stride_n + c_out * dst_stride_c + py * dst_stride_h + px * dst_stride_w;
+
+                for (int64_t y = 0; y < H_in; ++y) {
+                    char* dst_row = dst_layer + y * dst_step_h;
+
+                    for (int64_t x = 0; x < W_in; ++x) {
+                        memcpy(dst_row + x * dst_step_w, src_ptr, elem_size);
+                        src_ptr += elem_size;
+                    }
+                }
+            }
+        }
+
+        // The rearranged data has the same byte count as the input, so it fits
+        // back into the tensor's own storage.
+        memcpy(latents->data, dst_buf, dst_buf_size);
+    }
+
+    latents->ne[0] = W_out;
+    latents->ne[1] = H_out;
+    latents->ne[2] = C_out;
+
+    latents->nb[0] = dst_stride_w;
+    latents->nb[1] = dst_stride_h;
+    latents->nb[2] = dst_stride_c;
+    latents->nb[3] = dst_stride_n;
+
+    free(owned_buf);  // nullptr (caller-provided scratch) is a valid argument
+}
+
+// Packs spatial patches of a latent tensor into channels, in place.
+//
+// The tensor is read as ne = [W, H, C, N]. With p = patch_size, output channel
+// c = c_in * p*p + py * p + px receives, at (x, y, c, n), the input element at
+// (x * p + px, y * p + py, c_in, n). The result has
+// ne = [W / p, H / p, C * p*p, N]; ne[0..2] and nb[0..3] are rewritten for the
+// new densely packed layout. The data pointer itself is not changed.
+//
+// latents    : tensor to rearrange. The source is addressed through its nb[]
+//              strides; nb[0] is taken as the element size.
+// patch_size : patch edge length p; must be > 0 and must divide ne[0] and ne[1].
+// dst_buf    : optional scratch space holding the WHOLE tensor, i.e. at least
+//              N * C * H * W * nb[0] bytes (all batch entries, not just one).
+//              When nullptr, a temporary buffer is allocated and released here.
+//
+// The call is a no-op (tensor left untouched) when the arguments are invalid or
+// the temporary buffer cannot be allocated.
+void repatchify_latents(ggml_tensor* latents, int patch_size, char* dst_buf) {
+    if (latents == nullptr || patch_size <= 0) {
+        return;
+    }
+
+    const int64_t p          = patch_size;
+    const int64_t patch_area = p * p;
+
+    const int64_t N    = latents->ne[3];
+    const int64_t C_in = latents->ne[2];
+    const int64_t H_in = latents->ne[1];
+    const int64_t W_in = latents->ne[0];
+
+    // A partial patch at the right or bottom edge would be dropped, leaving part
+    // of the scratch buffer unwritten before it is copied back.
+    if (H_in % p != 0 || W_in % p != 0) {
+        return;
+    }
+
+    const int64_t C_out = C_in * patch_area;
+    const int64_t H_out = H_in / p;
+    const int64_t W_out = W_in / p;
+
+    const size_t elem_size = latents->nb[0];
+
+    const size_t src_stride_w = latents->nb[0];
+    const size_t src_stride_h = latents->nb[1];
+    const size_t src_stride_c = latents->nb[2];
+    const size_t src_stride_n = latents->nb[3];
+
+    // Neighbouring output pixels of one channel are p pixels apart in the input.
+    const size_t src_step_h = src_stride_h * p;
+    const size_t src_step_w = src_stride_w * p;
+
+    const size_t dst_stride_w = elem_size;
+    const size_t dst_stride_h = dst_stride_w * W_out;
+    const size_t dst_stride_c = dst_stride_h * H_out;
+    const size_t dst_stride_n = dst_stride_c * C_out;
+
+    // The loops below write every batch entry, so the scratch space has to cover
+    // N entries; nb[3] alone is only the size of a single one.
+    const size_t dst_buf_size = dst_stride_n * N;
+
+    char* owned_buf = nullptr;
+    if (dst_buf == nullptr && dst_buf_size > 0) {
+        owned_buf = static_cast<char*>(malloc(dst_buf_size));
+        if (owned_buf == nullptr) {
+            return;
+        }
+        dst_buf = owned_buf;
+    }
+
+    if (dst_buf_size > 0) {
+        const char* src_base = static_cast<const char*>(latents->data);
+        char* dst_ptr        = dst_buf;  // output is produced strictly in order
+
+        for (int64_t n = 0; n < N; ++n) {
+            for (int64_t c = 0; c < C_out; ++c) {
+                const int64_t c_in  = c / patch_area;
+                const int64_t c_rem = c % patch_area;
+                const int64_t py    = c_rem / p;
+                const int64_t px    = c_rem % p;
+
+                // Top-left input position feeding this output channel.
+                const char* src_layer = src_base + n * src_stride_n + c_in * src_stride_c + py * src_stride_h + px * src_stride_w;
+
+                for (int64_t y = 0; y < H_out; ++y) {
+                    const char* src_row = src_layer + y * src_step_h;
+
+                    for (int64_t x = 0; x < W_out; ++x) {
+                        memcpy(dst_ptr, src_row + x * src_step_w, elem_size);
+                        dst_ptr += elem_size;
+                    }
+                }
+            }
+        }
+
+        memcpy(latents->data, dst_buf, dst_buf_size);
+    }
+
+    latents->ne[0] = W_out;
+    latents->ne[1] = H_out;
+    latents->ne[2] = C_out;
+
+    latents->nb[0] = dst_stride_w;
+    latents->nb[1] = dst_stride_h;
+    latents->nb[2] = dst_stride_c;
+    latents->nb[3] = dst_stride_n;
+
+    free(owned_buf);  // nullptr (caller-provided scratch) is a valid argument
+}
+
 void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) {
     size_t buffer_head = 0;
     for (int k = 0; k < frames; k++) {
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
@@ -1321,63 +1321,14 @@ class StableDiffusionGGML {
             }
             if (patch_sz != 1) {
                 // unshuffle latents
-                const int64_t N    = latents->ne[3];
-                const int64_t C_in = latents->ne[2];
-                const int64_t H_in = latents->ne[1];
-                const int64_t W_in = latents->ne[0];
+                std::vector<char> dst_buffer(latents->nb[GGML_MAX_DIMS-1]);
+                char* dst_buf = dst_buffer.data();
 
-                const int64_t C_out = C_in / (patch_sz * patch_sz);
-                const int64_t H_out = H_in * patch_sz;
-                const int64_t W_out = W_in * patch_sz;
+                unpatchify_latents(latents, patch_sz, dst_buf);
 
-                const char* src_ptr = (char*)latents->data;
-                size_t elem_size    = latents->nb[0];
-
-                std::vector<char> dst_buffer(N * C_out * H_out * W_out * elem_size);
-                char* dst_base = dst_buffer.data();
-
-                size_t dst_stride_w = elem_size;
-                size_t dst_stride_h = dst_stride_w * W_out;
-                size_t dst_stride_c = dst_stride_h * H_out;
-                size_t dst_stride_n = dst_stride_c * C_out;
-
-                size_t dst_step_w = dst_stride_w * patch_sz;
-                size_t dst_step_h = dst_stride_h * patch_sz;
-
-                for (int64_t n = 0; n < N; ++n) {
-                    for (int64_t c = 0; c < C_in; ++c) {
-                        int64_t c_out = c / (patch_sz * patch_sz);
-                        int64_t rem   = c % (patch_sz * patch_sz);
-                        int64_t py    = rem / patch_sz;
-                        int64_t px    = rem % patch_sz;
-
-                        char* dst_layer = dst_base + n * dst_stride_n + c_out * dst_stride_c + py * dst_stride_h + px * dst_stride_w;
-
-                        for (int64_t y = 0; y < H_in; ++y) {
-                            char* dst_row = dst_layer + y * dst_step_h;
-
-                            for (int64_t x = 0; x < W_in; ++x) {
-                                memcpy(dst_row + x * dst_step_w, src_ptr, elem_size);
-                                src_ptr += elem_size;
-                            }
-                        }
-                    }
-                }
-
-                memcpy(latents->data, dst_buffer.data(), dst_buffer.size());
-
-                latents->ne[0] = W_out;
-                latents->ne[1] = H_out;
-                latents->ne[2] = C_out;
-
-                latents->nb[0] = dst_stride_w;
-                latents->nb[1] = dst_stride_h;
-                latents->nb[2] = dst_stride_c;
-                latents->nb[3] = dst_stride_n;
-
-                width  = W_out;
-                height = H_out;
-                dim    = C_out;
+                width  = latents->ne[0];
+                height = latents->ne[1];
+                dim    = latents->ne[ggml_n_dims(latents) - 1];
             }
             const float (*latent_rgb_proj)[channel] = nullptr;
             float* latent_rgb_bias                  = nullptr;
@@ -1453,59 +1404,10 @@ class StableDiffusionGGML {
 
             if (patch_sz != 1) {
                 // restore shuffled latents
-                const int64_t N        = latents->ne[3];
-                const int64_t C_in     = latents->ne[2];
-                const int64_t H_in     = latents->ne[1];
-                const int64_t W_in     = latents->ne[0];
-
-                const int64_t C_out = C_in * patch_sz * patch_sz;
-                const int64_t H_out   = H_in / patch_sz;
-                const int64_t W_out   = W_in / patch_sz;
-
-                const char* src_base   = (char*)latents->data;
-                const size_t elem_size = latents->nb[0];
-
-                const size_t src_stride_w = latents->nb[0];
-                const size_t src_stride_h = latents->nb[1];
-                const size_t src_stride_c = latents->nb[2];
-                const size_t src_stride_n = latents->nb[3];
-
-                std::vector<char> dst_buffer(N * C_out * H_out * W_out * elem_size);
-                char* dst_ptr = dst_buffer.data();
-
-                const size_t src_step_h = src_stride_h * patch_sz;
-                const size_t src_step_w = src_stride_w * patch_sz;
-
-                for (int64_t n = 0; n < N; ++n) {
-                    for (int64_t c = 0; c < C_out; ++c) {
-                        int64_t c_rem = c % (patch_sz * patch_sz);
-                        int64_t c_in  = c / (patch_sz * patch_sz);
-                        int64_t py    = c_rem / patch_sz;
-                        int64_t px    = c_rem % patch_sz;
-
-                        const char* src_layer = src_base + n * src_stride_n + c_in * src_stride_c + py * src_stride_h + px * src_stride_w;
-
-                        for (int64_t y = 0; y < H_out; ++y) {
-                            const char* src_row = src_layer + y * src_step_h;
-
-                            for (int64_t x = 0; x < W_out; ++x) {
-                                memcpy(dst_ptr, src_row + x * src_step_w, elem_size);
-                                dst_ptr += elem_size;
-                            }
-                        }
-                    }
-                }
-
-                memcpy(latents->data, dst_buffer.data(), dst_buffer.size());
-
-                latents->ne[0] = W_out;
-                latents->ne[1] = H_out;
-                latents->ne[2] = C_out;
+                std::vector<char> dst_buffer(latents->nb[GGML_MAX_DIMS-1]);
+                char* dst_buf = dst_buffer.data();
 
-                latents->nb[0] = elem_size;
-                latents->nb[1] = latents->nb[0] * W_out;
-                latents->nb[2] = latents->nb[1] * H_out;
-                latents->nb[3] = latents->nb[2] * C_out;
+                repatchify_latents(latents, patch_sz, dst_buf);
             }
         } else {
             if (preview_mode == PREVIEW_VAE) {