
Commit 2c2de89

Add image-text-to-image and image-text-to-video tasks (#1866)
The goal of these new tasks is to support models that take both image and text input and output either an image or a video. This PR makes the tasks as analogous to `image-to-image` and `image-to-video` as possible, with the only difference that the image input is now **optional**: an empty image and a valid prompt should still work for a model like FLUX.2 (which supports both text-to-image and image-to-image) or LTX Video (both text-to-video and image-to-video). Once this is in, I'll also have a widget PR in Moon to support these tasks in the model cards / widgets etc., and a follow-up PR adding them to the inference providers, so that we can then PR repos to change the task for compatible models.

---------

Co-authored-by: Merve Noyan <merveenoyan@gmail.com>
1 parent c3e284e commit 2c2de89

19 files changed, +665 −0 lines changed

packages/inference/src/lib/getProviderHelper.ts

Lines changed: 10 additions & 0 deletions

@@ -28,6 +28,8 @@ import type {
     ImageToImageTaskHelper,
     ImageToTextTaskHelper,
     ImageToVideoTaskHelper,
+    ImageTextToImageTaskHelper,
+    ImageTextToVideoTaskHelper,
     ObjectDetectionTaskHelper,
     QuestionAnsweringTaskHelper,
     SentenceSimilarityTaskHelper,
@@ -276,6 +278,14 @@ export function getProviderHelper(
     provider: InferenceProviderOrPolicy,
     task: "image-to-video"
 ): ImageToVideoTaskHelper & TaskProviderHelper;
+export function getProviderHelper(
+    provider: InferenceProviderOrPolicy,
+    task: "image-text-to-image"
+): ImageTextToImageTaskHelper & TaskProviderHelper;
+export function getProviderHelper(
+    provider: InferenceProviderOrPolicy,
+    task: "image-text-to-video"
+): ImageTextToVideoTaskHelper & TaskProviderHelper;
 export function getProviderHelper(
     provider: InferenceProviderOrPolicy,
     task: "sentence-similarity"

packages/inference/src/providers/providerHelper.ts

Lines changed: 16 additions & 0 deletions

@@ -19,6 +19,8 @@ import type {
     ImageToTextInput,
     ImageToTextOutput,
     ImageToVideoInput,
+    ImageTextToImageInput,
+    ImageTextToVideoInput,
     ObjectDetectionInput,
     ObjectDetectionOutput,
     QuestionAnsweringInput,
@@ -54,6 +56,8 @@ import { toArray } from "../utils/toArray.js";
 import type { ImageToImageArgs } from "../tasks/cv/imageToImage.js";
 import type { AutomaticSpeechRecognitionArgs } from "../tasks/audio/automaticSpeechRecognition.js";
 import type { ImageToVideoArgs } from "../tasks/cv/imageToVideo.js";
+import type { ImageTextToImageArgs } from "../tasks/cv/imageTextToImage.js";
+import type { ImageTextToVideoArgs } from "../tasks/cv/imageTextToVideo.js";
 import type { ImageSegmentationArgs } from "../tasks/cv/imageSegmentation.js";

 /**
@@ -159,6 +163,18 @@ export interface ImageToVideoTaskHelper {
     preparePayloadAsync(args: ImageToVideoArgs): Promise<RequestArgs>;
 }

+export interface ImageTextToImageTaskHelper {
+    getResponse(response: unknown, url?: string, headers?: HeadersInit): Promise<Blob>;
+    preparePayload(params: BodyParams<ImageTextToImageInput & BaseArgs>): Record<string, unknown>;
+    preparePayloadAsync(args: ImageTextToImageArgs): Promise<RequestArgs>;
+}
+
+export interface ImageTextToVideoTaskHelper {
+    getResponse(response: unknown, url?: string, headers?: HeadersInit): Promise<Blob>;
+    preparePayload(params: BodyParams<ImageTextToVideoInput & BaseArgs>): Record<string, unknown>;
+    preparePayloadAsync(args: ImageTextToVideoArgs): Promise<RequestArgs>;
+}
+
 export interface ImageSegmentationTaskHelper {
     getResponse(response: unknown, url?: string, headers?: HeadersInit): Promise<ImageSegmentationOutput>;
     preparePayload(params: BodyParams<ImageSegmentationInput & BaseArgs>): Record<string, unknown> | BodyInit;
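
The two new interfaces mirror their image-to-image and image-to-video counterparts, so a provider adds support by implementing the same three methods. Purely as a hypothetical, simplified sketch of that shape (not part of this diff; a real helper also extends TaskProviderHelper, builds a provider-specific payload, and encodes the optional input image in preparePayloadAsync):

```typescript
// Inside packages/inference; type names come from the interfaces added above.
import type { ImageTextToImageTaskHelper } from "../providers/providerHelper.js";
import type { RequestArgs } from "../types.js";

const sketchHelper: ImageTextToImageTaskHelper = {
  // Map validated args into the JSON body this (hypothetical) provider expects.
  preparePayload: (params) => ({ ...params.args }),
  // Real helpers build a proper RequestArgs here (e.g. base64-encode the optional image).
  preparePayloadAsync: async (args) => ({ ...args } as unknown as RequestArgs),
  // Providers return the generated image as binary data.
  getResponse: async (response) => response as Blob,
};
```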

packages/inference/src/snippets/getInferenceSnippets.ts

Lines changed: 5 additions & 0 deletions

@@ -96,7 +96,10 @@ const HF_PYTHON_METHODS: Partial<Record<WidgetType, string>> = {
     "image-classification": "image_classification",
     "image-segmentation": "image_segmentation",
     "image-to-image": "image_to_image",
+    "image-to-video": "image_to_video",
     "image-to-text": "image_to_text",
+    "image-text-to-image": "image_text_to_image",
+    "image-text-to-video": "image_text_to_video",
     "object-detection": "object_detection",
     "question-answering": "question_answering",
     "sentence-similarity": "sentence_similarity",
@@ -390,7 +393,9 @@ const snippets: Partial<
     "fill-mask": snippetGenerator("basic"),
     "image-classification": snippetGenerator("basicImage"),
     "image-segmentation": snippetGenerator("basicImage"),
+    "image-text-to-image": snippetGenerator("imageToImage", prepareImageToImageInput),
     "image-text-to-text": snippetGenerator("conversational"),
+    "image-text-to-video": snippetGenerator("imageToVideo", prepareImageToImageInput),
     "image-to-image": snippetGenerator("imageToImage", prepareImageToImageInput),
     "image-to-text": snippetGenerator("basicImage"),
     "image-to-video": snippetGenerator("imageToVideo", prepareImageToImageInput),

packages/inference/src/tasks/cv/imageTextToImage.ts

Lines changed: 22 additions & 0 deletions (new file)

import type { ImageTextToImageInput } from "@huggingface/tasks";
import { resolveProvider } from "../../lib/getInferenceProviderMapping.js";
import { getProviderHelper } from "../../lib/getProviderHelper.js";
import type { BaseArgs, Options } from "../../types.js";
import { innerRequest } from "../../utils/request.js";

export type ImageTextToImageArgs = BaseArgs & ImageTextToImageInput;

/**
 * This task takes an image and text input and outputs a new generated image.
 * Recommended model: black-forest-labs/FLUX.2-dev
 */
export async function imageTextToImage(args: ImageTextToImageArgs, options?: Options): Promise<Blob> {
    const provider = await resolveProvider(args.provider, args.model, args.endpointUrl);
    const providerHelper = getProviderHelper(provider, "image-text-to-image");
    const payload = await providerHelper.preparePayloadAsync(args);
    const { data: res, requestContext } = await innerRequest<Blob>(payload, providerHelper, {
        ...options,
        task: "image-text-to-image",
    });
    return providerHelper.getResponse(res, requestContext.url, requestContext.info.headers as Record<string, string>);
}
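
Because `packages/inference/src/tasks/index.ts` re-exports this module (see that diff below), the function is callable from the package root. A minimal sketch of the pure text-to-image case the commit message describes (no image supplied); the `prompt` field name is an assumption, since `ImageTextToImageInput` lives in `@huggingface/tasks` and is not shown in this diff:

```typescript
import { imageTextToImage } from "@huggingface/inference";

// Image input is optional per this PR: a valid prompt alone should work
// for a model like FLUX.2 that also supports plain text-to-image.
const generated: Blob = await imageTextToImage({
  accessToken: process.env.HF_TOKEN,
  model: "black-forest-labs/FLUX.2-dev",
  prompt: "A city above clouds, pastel colors, Victorian style",
});
```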

packages/inference/src/tasks/cv/imageTextToVideo.ts

Lines changed: 22 additions & 0 deletions (new file)

import type { ImageTextToVideoInput } from "@huggingface/tasks";
import { resolveProvider } from "../../lib/getInferenceProviderMapping.js";
import { getProviderHelper } from "../../lib/getProviderHelper.js";
import type { BaseArgs, Options } from "../../types.js";
import { innerRequest } from "../../utils/request.js";

export type ImageTextToVideoArgs = BaseArgs & ImageTextToVideoInput;

/**
 * This task takes an image and text input and outputs a generated video.
 * Recommended model: Lightricks/LTX-Video
 */
export async function imageTextToVideo(args: ImageTextToVideoArgs, options?: Options): Promise<Blob> {
    const provider = await resolveProvider(args.provider, args.model, args.endpointUrl);
    const providerHelper = getProviderHelper(provider, "image-text-to-video");
    const payload = await providerHelper.preparePayloadAsync(args);
    const { data: res, requestContext } = await innerRequest<Blob>(payload, providerHelper, {
        ...options,
        task: "image-text-to-video",
    });
    return providerHelper.getResponse(res, requestContext.url, requestContext.info.headers as Record<string, string>);
}
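
The video counterpart mirrors the call exactly. Again a sketch only, with the `image`/`prompt` field names assumed from the widget example inputs added elsewhere in this PR and a placeholder image URL:

```typescript
import { imageTextToVideo } from "@huggingface/inference";

// Optional reference image plus a motion prompt, as in the widget example inputs.
const cat = await (await fetch("https://example.com/cat.png")).blob(); // placeholder URL
const video: Blob = await imageTextToVideo({
  accessToken: process.env.HF_TOKEN,
  model: "Lightricks/LTX-Video",
  image: cat,
  prompt: "The cat starts to dance",
});
```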

packages/inference/src/tasks/index.ts

Lines changed: 2 additions & 0 deletions

@@ -14,6 +14,8 @@ export * from "./cv/imageSegmentation.js";
 export * from "./cv/imageToImage.js";
 export * from "./cv/imageToText.js";
 export * from "./cv/imageToVideo.js";
+export * from "./cv/imageTextToImage.js";
+export * from "./cv/imageTextToVideo.js";
 export * from "./cv/objectDetection.js";
 export * from "./cv/textToImage.js";
 export * from "./cv/textToVideo.js";

packages/tasks/src/pipelines.ts

Lines changed: 8 additions & 0 deletions

@@ -557,6 +557,14 @@ export const PIPELINE_DATA = {
         name: "Image-Text-to-Text",
         modality: "multimodal",
     },
+    "image-text-to-image": {
+        name: "Image-Text-to-Image",
+        modality: "multimodal",
+    },
+    "image-text-to-video": {
+        name: "Image-Text-to-Video",
+        modality: "multimodal",
+    },
     "visual-question-answering": {
         name: "Visual Question Answering",
         subtasks: [
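
Assuming the usual derivation of `PipelineType` from the keys of `PIPELINE_DATA` (unchanged by this diff), the new identifiers become valid task strings for downstream consumers, e.g.:

```typescript
import type { PipelineType } from "@huggingface/tasks";

// Type-checks once the package includes this commit.
const editTask: PipelineType = "image-text-to-image";
const animateTask: PipelineType = "image-text-to-video";
```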

packages/tasks/src/snippets/inputs.ts

Lines changed: 12 additions & 0 deletions

@@ -94,6 +94,16 @@ const inputsImageToVideo = () => `{
     "prompt": "The cat starts to dance"
 }`;

+const inputsImageTextToImage = () => `{
+    "image": "cat.png",
+    "prompt": "Turn the cat into a tiger."
+}`;
+
+const inputsImageTextToVideo = () => `{
+    "image": "cat.png",
+    "prompt": "The cat starts to dance"
+}`;
+
 const inputsImageSegmentation = () => `"cats.jpg"`;

 const inputsObjectDetection = () => `"cats.jpg"`;
@@ -130,6 +140,8 @@ const modelInputSnippets: {
     "image-to-text": inputsImageToText,
     "image-to-image": inputsImageToImage,
     "image-to-video": inputsImageToVideo,
+    "image-text-to-image": inputsImageTextToImage,
+    "image-text-to-video": inputsImageTextToVideo,
     "image-segmentation": inputsImageSegmentation,
     "object-detection": inputsObjectDetection,
     "question-answering": inputsQuestionAnswering,

packages/tasks/src/tasks/image-text-to-image/about.md

Lines changed: 73 additions & 0 deletions (new file)
## Use Cases

### Instruction-based Image Editing

Image-text-to-image models can be used to edit images based on natural language instructions. For example, you can provide an image of a summer landscape and the instruction "Make it winter, add snow" to generate a winter version of the same scene.

### Style Transfer

These models can apply artistic styles or transformations to images based on text descriptions. For instance, you can transform a photo into a painting style by providing prompts like "Make it look like a Van Gogh painting" or "Convert to watercolor style."

### Image Variations

Generate variations of an existing image by providing different text prompts. This is useful for creative workflows where you want to explore different versions of the same image with specific modifications.

### Guided Image Generation

Use a reference image along with text prompts to guide the generation process. This allows for more controlled image generation than text-to-image models alone, as the reference image provides structural guidance.

### Image Inpainting and Outpainting

Fill in missing or masked parts of an image based on text descriptions, or extend an image beyond its original boundaries with text-guided generation.

## Task Variants

### Instruction-based Editing

Models that follow natural language instructions to edit images and can perform complex edits such as object removal, color changes, and compositional modifications.

### Reference-guided Generation

Models that use a reference image to guide the generation process while incorporating text prompts to control specific attributes or modifications.

### Conditional Image-to-Image

Models that perform specific transformations based on text conditions, such as changing weather conditions, time of day, or seasonal variations.

## Inference

You can use the Diffusers library to interact with image-text-to-image models.

```python
import torch
from diffusers import Flux2Pipeline
from diffusers.utils import load_image

repo_id = "black-forest-labs/FLUX.2-dev"
device = "cuda:0"
torch_dtype = torch.bfloat16

pipe = Flux2Pipeline.from_pretrained(
    repo_id, torch_dtype=torch_dtype
)
pipe.enable_model_cpu_offload()  # not needed on >80 GB VRAM cards like H200 or B200; use `pipe.to(device)` instead

prompt = "Realistic macro photograph of a hermit crab using a soda can as its shell, partially emerging from the can, captured with sharp detail and natural colors, on a sunlit beach with soft shadows and a shallow depth of field, with blurred ocean waves in the background. The can has the text `BFL Diffusers` on it and it has a color gradient that starts with #FF5733 at the top and transitions to #33FF57 at the bottom."

# cat_image = load_image("https://huggingface.co/spaces/zerogpu-aoti/FLUX.1-Kontext-Dev-fp8-dynamic/resolve/main/cat.png")
image = pipe(
    prompt=prompt,
    # image=[cat_image],  # optional multi-image input
    generator=torch.Generator(device=device).manual_seed(42),
    num_inference_steps=50,
    guidance_scale=4,
).images[0]

image.save("flux2_output.png")
```
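
You can also call compatible models from JavaScript with `@huggingface/inference`, which adds an `imageTextToImage` helper for this task in this commit. A minimal sketch; provider-side support is still being rolled out, and the `prompt`/`image` field names below are assumptions rather than the final `ImageTextToImageInput` type:

```typescript
import { imageTextToImage } from "@huggingface/inference";

// Assumed fields: an instruction prompt plus an optional reference image to edit.
const edited = await imageTextToImage({
  accessToken: "hf_...",
  model: "black-forest-labs/FLUX.2-dev",
  prompt: "Turn the cat into a tiger.",
  // image: await (await fetch("cat.png")).blob(), // optional reference image
});
```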

## Useful Resources

- [FLUX.2 Model Card](https://huggingface.co/black-forest-labs/FLUX.2-dev)
- [Diffusers documentation on Image-to-Image](https://huggingface.co/docs/diffusers/using-diffusers/img2img)
- [ControlNet for Conditional Image Generation](https://huggingface.co/docs/diffusers/using-diffusers/controlnet)

packages/tasks/src/tasks/image-text-to-image/data.ts

Lines changed: 54 additions & 0 deletions (new file)

import type { TaskDataCustom } from "../index.js";

const taskData: TaskDataCustom = {
    datasets: [],
    demo: {
        inputs: [
            {
                filename: "image-text-to-image-input.jpeg",
                type: "img",
            },
            {
                label: "Input",
                content: "A city above clouds, pastel colors, Victorian style",
                type: "text",
            },
        ],
        outputs: [
            {
                filename: "image-text-to-image-output.png",
                type: "img",
            },
        ],
    },
    metrics: [
        {
            description:
                "The Fréchet Inception Distance (FID) calculates the distance between the distributions of synthetic and real samples. A lower FID score indicates better similarity between the distributions of real and generated images.",
            id: "FID",
        },
        {
            description:
                "CLIP Score measures the similarity between the generated image and the text prompt using CLIP embeddings. A higher score indicates better alignment with the text prompt.",
            id: "CLIP",
        },
    ],
    models: [
        {
            description: "A powerful model for image-text-to-image generation.",
            id: "black-forest-labs/FLUX.2-dev",
        },
    ],
    spaces: [
        {
            description: "An application for image-text-to-image generation.",
            id: "black-forest-labs/FLUX.2-dev",
        },
    ],
    summary:
        "Image-text-to-image models take an image and a text prompt as input and generate a new image based on the reference image and text instructions. These models are useful for image editing, style transfer, image variations, and guided image generation tasks.",
    widgetModels: ["black-forest-labs/FLUX.2-dev"],
    youtubeId: undefined,
};

export default taskData;
