Merged
31 commits
e82c32f  feat: partial ragas intergation (kevinagyeman, Nov 23, 2025)
6a651af  Merge remote-tracking branch 'origin/develop' into ragas-integration-… (kevinagyeman, Nov 23, 2025)
f7a53ca  feat: ragas integration (partial) (kevinagyeman, Nov 23, 2025)
5ea1e61  feat: partial ragas integration wip (kevinagyeman, Nov 24, 2025)
a41d158  edit: add .worktrees/ to gitignore (nicofretti, Nov 25, 2025)
c814c1b  fix: theme + storybook (nicofretti, Nov 25, 2025)
a9fe6fd  fix: bug config pipeline (nicofretti, Nov 27, 2025)
5562d01  edit: improve generator validator (nicofretti, Nov 27, 2025)
20d7202  fix: stop job handling (nicofretti, Nov 27, 2025)
20c8d1e  Merge remote-tracking branch 'origin/feat/replace-modals' into ragas-… (kevinagyeman, Nov 28, 2025)
918f89e  feat: integration of answer_relevancy and context_precision metrics (kevinagyeman, Nov 28, 2025)
783eadc  feat: aggregate multiple ragas metrics (kevinagyeman, Dec 5, 2025)
46452fc  feat: ragas integration completion (kevinagyeman, Dec 14, 2025)
157af89  Merge remote-tracking branch 'origin/develop' into ragas-integration-… (kevinagyeman, Dec 16, 2025)
8caaf62  fix: update blocks to use BlockExecutionContext pattern and add depen… (kevinagyeman, Dec 16, 2025)
ceda594  fix: address copilot code review feedback (kevinagyeman, Dec 19, 2025)
d0f3601  wip: fixing the ragas block + field mapper (nicofretti, Dec 26, 2025)
06d553c  wip: fixing ragas block (nicofretti, Dec 27, 2025)
1f03105  fix: langfuse error (nicofretti, Dec 27, 2025)
da1196b  fix: integration ragas (nicofretti, Jan 5, 2026)
1f10c30  fix: renaming + review (nicofretti, Jan 6, 2026)
57ad2c0  fix: format (nicofretti, Jan 6, 2026)
350374c  fix: missing import + doc (nicofretti, Jan 6, 2026)
fb5221f  fix: pre-merge + docs (nicofretti, Jan 6, 2026)
52a1645  fix: ui view (nicofretti, Jan 6, 2026)
627daf2  edit: changelog (nicofretti, Jan 6, 2026)
2249e1a  fix: pipeline errors (nicofretti, Jan 6, 2026)
8acc9c0  fix: pipeline (nicofretti, Jan 6, 2026)
7e0b20c  edit: import on ragas (nicofretti, Jan 6, 2026)
735107a  fix: debug_pipeline.py (nicofretti, Jan 6, 2026)
b9c3339  Merge branch 'develop' into ragas-integration-block (nicofretti, Jan 6, 2026)
19 changes: 19 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,24 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [1.3.0] - 2026-01-06 🚀

### Added
- Pipeline execution constraints for better control over pipeline runs
- Integration with Langfuse for observability and monitoring
- RAGAS metrics integration for RAG evaluation
- Modal components for improved UI interactions
- Block configuration view for enhanced block setup
- Generator view for better visualization

### Changed
- Refactored codebase with Pydantic for improved data validation and type safety
- Enhanced custom components theming

### Fixed
- Job cancellation bug that prevented proper pipeline stopping
- Langfuse dataset naming issues

## [1.2.0] - 2025-11-17 🚀

### Added
@@ -49,6 +67,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
- REST API for pipeline execution
- Template system for common use cases

[1.3.0]: https://github.com/nicofretti/DataGenFlow/compare/release-v1.2.0...release-v1.3.0
[1.2.0]: https://github.com/nicofretti/DataGenFlow/compare/release-v1.1.0...release-v1.2.0
[1.1.0]: https://github.com/nicofretti/DataGenFlow/compare/release...release-v1.1.0
[1.0.0]: https://github.com/nicofretti/DataGenFlow/releases/tag/release
54 changes: 46 additions & 8 deletions app.py
@@ -54,19 +54,48 @@ def is_multiplier_pipeline(blocks: list[dict[str, Any]]) -> bool:
    return getattr(block_class, "is_multiplier", False)


def _patch_langfuse_usage_bug() -> None:
    """patch litellm langfuse bug where .get() is called on pydantic model instead of dict"""
    try:
        from litellm.types.utils import CompletionUsage

        if not hasattr(CompletionUsage, "get"):
            # add get method so pydantic model works like dict
            def pydantic_get(self, key, default=None):
                return getattr(self, key, default)

            CompletionUsage.get = pydantic_get
    except (ImportError, AttributeError):
        # best-effort patch: if litellm or CompletionUsage is unavailable or changed,
        # simply skip applying the compatibility shim and continue without failing.
        logger.warning(
            "Skipping Langfuse usage patch: litellm or CompletionUsage is unavailable "
            "or has an unexpected structure."
        )


@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
    import os

    import litellm

    from lib.blocks.commons import UsageTracker

    await storage.init_db()

    # configure langfuse integration if credentials are set
    # patch langfuse bug before enabling it
    _patch_langfuse_usage_bug()

    # configure langfuse integration and usage tracking
    # note: litellm.callbacks is for custom callbacks, success_callback is for built-in integrations
    if os.getenv("LANGFUSE_PUBLIC_KEY") and os.getenv("LANGFUSE_SECRET_KEY"):
        litellm.success_callback = ["langfuse"]
        logger.info("Langfuse observability enabled")

    # always register usage tracker via callbacks (works for all LLM calls including RAGAS)
    litellm.callbacks = [UsageTracker.callback]

    yield
    # close storage connection on shutdown
    await storage.close()
@@ -455,17 +484,26 @@ async def list_blocks() -> list[dict[str, Any]]:
    """list all registered blocks with dynamically injected model options"""
    blocks = registry.list_blocks()

    # get available llm models
    # get available llm and embedding models
    llm_models = await llm_config_manager.list_llm_models()
    embedding_models = await llm_config_manager.list_embedding_models()
    model_names = [model.name for model in llm_models]
    embedding_names = [model.name for model in embedding_models]

    # inject model options into TextGenerator and StructuredGenerator schemas
    # inject model options into block schemas
    for block in blocks:
        if block.get("type") in ["TextGenerator", "StructuredGenerator"]:
            if "config_schema" in block and "properties" in block["config_schema"]:
                if "model" in block["config_schema"]["properties"]:
                    # add enum with available model names
                    block["config_schema"]["properties"]["model"]["enum"] = model_names
        block_type = block.get("type")
        props = block.get("config_schema", {}).get("properties", {})

        # inject LLM model options
        if block_type in ["TextGenerator", "StructuredGenerator", "RagasMetrics"]:
            if "model" in props:
                props["model"]["enum"] = model_names

        # inject embedding model options for RagasMetrics
        if block_type == "RagasMetrics":
            if "embedding_model" in props:
                props["embedding_model"]["enum"] = embedding_names

    return blocks

65 changes: 57 additions & 8 deletions debug_pipeline.py
@@ -1,15 +1,55 @@
import asyncio
import logging
import os

from lib.storage import Storage
from lib.workflow import Pipeline as WorkflowPipeline
import litellm

PIPELINE_ID = 26
# disable ragas analytics (must be before any ragas imports)
os.environ["RAGAS_DO_NOT_TRACK"] = "true"

# suppress asyncio SSL errors at shutdown (harmless cleanup noise from pending HTTP connections)
logging.getLogger("asyncio").setLevel(logging.CRITICAL)


def _patch_langfuse_usage_bug() -> None:
    """patch litellm langfuse bug where .get() is called on pydantic model"""
    try:
        from litellm.types.utils import CompletionUsage

        if not hasattr(CompletionUsage, "get"):

            def pydantic_get(self, key, default=None):
                return getattr(self, key, default)

            CompletionUsage.get = pydantic_get
    except (ImportError, AttributeError):
        pass


# apply patch before any litellm callbacks
_patch_langfuse_usage_bug()

from lib.blocks.commons import UsageTracker  # noqa: E402
from lib.storage import Storage  # noqa: E402
from lib.workflow import Pipeline as WorkflowPipeline  # noqa: E402

# setup logging
logging.basicConfig(level=logging.DEBUG)

# register usage tracker callback
litellm.success_callback = [UsageTracker.callback]
# also try callbacks list
litellm.callbacks = [UsageTracker.callback]

PIPELINE_ID = 92
SEED_DATA = {
    "repetitions": 1,
    "metadata": {
        "content": (
            "Electric cars reduce emissions but require extensive charging "
            "infrastructure and have higher upfront costs compared to traditional vehicles."
            "Python is a high-level, interpreted programming language known for "
            "its clear syntax and readability. It was created by Guido van Rossum "
            "and first released in 1991. Python supports multiple programming "
            "paradigms including procedural, object-oriented, and functional."
        )
    },
}
@@ -26,9 +66,18 @@ async def main() -> None:

    workflow = WorkflowPipeline(name=pipeline_data.name, blocks=pipeline_data.definition["blocks"])

    result, _, trace_id = await workflow.execute(SEED_DATA["metadata"])  # type: ignore[arg-type]
    print(f"trace_id: {trace_id}")
    print(f"result: {result}")
    execution_result = await workflow.execute(SEED_DATA["metadata"])
    print(f"trace_id: {execution_result.trace_id}")
    print(f"result: {execution_result.result}")
    print(f"usage: {execution_result.usage}")

    # shutdown ragas analytics batcher before event loop closes
    try:
        from ragas._analytics import _analytics_batcher

        _analytics_batcher.shutdown()
    except ImportError:
        pass


if __name__ == "__main__":
112 changes: 112 additions & 0 deletions docs/ragas_evaluation.md
@@ -0,0 +1,112 @@
# RAGAS Evaluation Guide

## Overview

RAGAS (Retrieval Augmented Generation Assessment) is a framework for evaluating the quality of RAG-generated answers. The **RagasMetrics** block evaluates a single QA pair against multiple quality metrics.
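For orientation, the snippet below shows roughly what the block does under the hood by calling the RAGAS library directly on one QA pair. It is a sketch only: the import paths and `evaluate()` signature follow the classic RAGAS API and may differ in the version this project pins, and the call requires LLM credentials configured for your provider.

```python
# illustrative sketch, not the block's actual implementation
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import answer_relevancy, faithfulness, context_precision, context_recall

sample = Dataset.from_dict({
    "question": ["What is the capital of France?"],
    "answer": ["Paris is the capital of France."],
    "contexts": [["Paris is the capital and most populous city of France."]],
    "ground_truth": ["Paris is the capital of France."],
})

# each metric is judged by an LLM; answer_relevancy also needs embeddings
scores = evaluate(sample, metrics=[answer_relevancy, faithfulness, context_precision, context_recall])
print(scores)  # e.g. {'answer_relevancy': 0.97, 'faithfulness': 1.0, ...}
```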

## Metrics

### 1. Answer Relevancy
**What it measures**: How relevant the answer is to the question.

**Range**: 0.0 - 1.0 (higher is better)

**Requires**:
- question
- answer
- embeddings (configured via embedding model)

**Example**:
- Question: "What is the capital of France?"
- Answer: "Paris is the capital of France" -> High score (0.9+)
- Answer: "France is a European country" -> Low score (0.3-)

### 2. Faithfulness
**What it measures**: Whether the answer is factually consistent with the provided context.

**Range**: 0.0 - 1.0 (higher is better)

**Requires**:
- question
- answer
- contexts

**Example**:
- Context: "The Eiffel Tower is 330 meters tall"
- Answer: "The Eiffel Tower is 330 meters tall" -> High score (0.9+)
- Answer: "The Eiffel Tower is 500 meters tall" -> Low score (0.3-)

### 3. Context Precision
**What it measures**: Whether the relevant context chunks appear earlier in the context list.

**Range**: 0.0 - 1.0 (higher is better)

**Requires**:
- question
- contexts
- ground_truth

**Example**:
If the most relevant context appears first in the list -> High score
If relevant context is buried at the end -> Low score
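Conceptually the score is a rank-weighted precision over the context list. A minimal sketch, assuming the per-chunk relevance judgments (which an LLM produces in practice) are already available:

```python
def context_precision(relevance: list[int]) -> float:
    # relevance[i] is 1 if context chunk i is judged relevant to the ground truth
    hits, weighted = 0, 0.0
    for k, rel in enumerate(relevance, start=1):
        hits += rel
        if rel:
            weighted += hits / k  # precision@k, counted at relevant ranks only
    return weighted / max(sum(relevance), 1)

print(context_precision([1, 0, 1]))  # ~0.83: relevant chunk ranked first
print(context_precision([0, 1, 1]))  # ~0.58: relevant chunks ranked later
```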

### 4. Context Recall
**What it measures**: Whether all information needed to answer the question is present in the contexts.

**Range**: 0.0 - 1.0 (higher is better)

**Requires**:
- question
- contexts
- ground_truth

**Example**:
- Ground truth: "Paris is the capital of France, located on the Seine river"
- Context includes both facts -> High score (1.0)
- Context only includes capital fact -> Lower score (0.5)
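The score is the fraction of ground-truth statements that can be attributed to the retrieved contexts:

$$\text{context\_recall} = \frac{\text{ground-truth statements supported by the contexts}}{\text{total ground-truth statements}}$$

In the example above, one of the two facts is covered, giving 0.5.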

## Configuration

### Field References

The block uses field references to locate data in the pipeline state:
- **question_field**: Field containing the question
- **answer_field**: Field containing the answer
- **contexts_field**: Field containing contexts (list of strings)
- **ground_truth_field**: Field containing expected answer

These dropdowns are populated from the fields available in the pipeline. You can use the **FieldMapper** block to rename or create fields as needed (e.g. to extract fields from nested structures), as in the sketch below.
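For example, a **FieldMapper** placed before this block could expose nested data under flat names. The mapping keys become fields selectable in the dropdowns above; the source paths on the right are hypothetical:

```json
{
  "mappings": {
    "question": "input.query",
    "contexts": "retrieval.documents"
  }
}
```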

### Selecting Metrics

Use the **metrics** multi-select to choose which metrics to compute:
- Check all metrics you want to evaluate
- Uncheck metrics you don't need
- Note: `answer_relevancy` requires an embedding model

### Score Threshold

The **score_threshold** field sets the minimum value each metric must reach to count as passing. The block outputs a boolean `passed` that is true only if all selected metrics meet or exceed this threshold.
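A sketch of the aggregation this describes:

```python
scores = {"answer_relevancy": 0.92, "faithfulness": 0.88}  # selected metrics only
score_threshold = 0.8
passed = all(s >= score_threshold for s in scores.values())  # True
```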


### Model Configuration

- **model**: LLM model for evaluation (leave empty for pipeline default)
- **embedding_model**: Embedding model for answer_relevancy (leave empty for default)
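Putting it together, a full block configuration might look like this. The `type`/`config` envelope and the field values are assumptions for illustration, not a verbatim schema:

```json
{
  "type": "RagasMetrics",
  "config": {
    "question_field": "question",
    "answer_field": "answer",
    "contexts_field": "contexts",
    "ground_truth_field": "ground_truth",
    "metrics": ["answer_relevancy", "faithfulness", "context_precision", "context_recall"],
    "score_threshold": 0.8,
    "model": "",
    "embedding_model": ""
  }
}
```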

## Output Format

The block outputs a single `ragas_scores` object:

```json
{
  ...
  "ragas_scores": {
    "answer_relevancy": 0.92,
    "faithfulness": 0.88,
    "context_precision": 0.95,
    "context_recall": 0.85,
    "passed": true
  }
}
```
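Downstream steps can key off the aggregate flag. A hypothetical filtering sketch over exported records (`records` is assumed to be a list of result dicts):

```python
kept = [r for r in records if r.get("ragas_scores", {}).get("passed")]
```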
42 changes: 42 additions & 0 deletions frontend/src/components/pipeline-editor/BlockConfigPanel.tsx
@@ -147,6 +147,48 @@ export default function BlockConfigPanel({
    return <Checkbox checked={value} onChange={(e) => handleChange(key, e.target.checked)} />;
  }

  // multi-select enum (array with enum items)
  if (schema.type === "array" && schema.items?.enum && Array.isArray(schema.items.enum)) {
    const selectedValues = Array.isArray(value) ? value : [];
    const options = schema.items.enum;

    const handleToggle = (option: string) => {
      const newValues = selectedValues.includes(option)
        ? selectedValues.filter((v) => v !== option)
        : [...selectedValues, option];
      handleChange(key, newValues);
    };

    return (
      <Box
        sx={{
          display: "flex",
          flexDirection: "column",
          gap: 2,
          p: 2,
          borderRadius: 2,
          bg: "canvas.subtle",
        }}
      >
        {options.map((option: string) => (
          <Box key={option} sx={{ display: "flex", alignItems: "center", gap: 2 }}>
            <Checkbox
              checked={selectedValues.includes(option)}
              onChange={() => handleToggle(option)}
            />
            <Text
              sx={{
                color: "fg.default",
              }}
            >
              {option}
            </Text>
          </Box>
        ))}
      </Box>
    );
  }

  // enum dropdown (predefined options)
  if (schema.enum && Array.isArray(schema.enum)) {
    return (
7 changes: 7 additions & 0 deletions frontend/src/components/pipeline-editor/PipelineEditor.tsx
@@ -436,6 +436,13 @@ export default function PipelineEditor({
          availableFields.add(output);
        }
      });

      // for FieldMapper, extract keys from mappings config
      if (node.data.block.type === "FieldMapper" && node.data.config?.mappings) {
        Object.keys(node.data.config.mappings).forEach((key) => {
          availableFields.add(key);
        });
      }
    }
  });