Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Python package

on:
push:
branches: [ "main" ]
pull_request:
branches: [ "main" ]

jobs:
build:

runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.9", "3.10", "3.11"]

steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
pytest
8 changes: 5 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,10 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.vscode/launch.json

.idea/
.vscode/

temp.pdf
google-cloud-sdk
google-cloud-cli-469.0.0-linux-x86_64.tar.gz
Expand All @@ -173,4 +175,4 @@ google-cloud-cli-linux-x86_64.tar.gz
newenv
files
startupbackend.sh
startupfrontend.sh
startupfrontend.sh
3 changes: 3 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 15 additions & 0 deletions .idea/llm-graph-builder.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{
}
58 changes: 58 additions & 0 deletions backend/.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
OPENAI_API_KEY = "" #This is required if you are using openai embedding model
EMBEDDING_MODEL = "all-MiniLM-L6-v2" #this can be openai or vertexai or by default all-MiniLM-L6-v2
RAGAS_EMBEDDING_MODEL = "openai" #Keep blank if you want to use all-MiniLM-L6-v2 for ragas embeddings
IS_EMBEDDING = "TRUE"
KNN_MIN_SCORE = "0.94"
# Enable Gemini (default is False) | Can be False or True
GEMINI_ENABLED = False
# Enable Google Cloud logs (default is False) | Can be False or True
GCP_LOG_METRICS_ENABLED = False
NUMBER_OF_CHUNKS_TO_COMBINE = 6
UPDATE_GRAPH_CHUNKS_PROCESSED = 20
NEO4J_URI = ""
NEO4J_USERNAME = ""
NEO4J_PASSWORD = ""
NEO4J_DATABASE = ""
AWS_ACCESS_KEY_ID = ""
AWS_SECRET_ACCESS_KEY = ""
LANGCHAIN_API_KEY = ""
LANGCHAIN_PROJECT = ""
LANGCHAIN_TRACING_V2 = ""
LANGCHAIN_ENDPOINT = ""
GCS_FILE_CACHE = "" #save the file into GCS or local, Should be True or False
NEO4J_USER_AGENT=""
ENABLE_USER_AGENT = ""
LLM_MODEL_CONFIG_model_version=""
ENTITY_EMBEDDING="TRUE" # TRUE or FALSE based on whether to create embeddings for entities suitable for entity vector mode
DUPLICATE_SCORE_VALUE =0.97
DUPLICATE_TEXT_DISTANCE =3
DEFAULT_DIFFBOT_CHAT_MODEL="openai_gpt_4o" #whichever model is specified here must have a config entry in the format below
#examples
LLM_MODEL_CONFIG_openai_gpt_3.5="gpt-3.5-turbo-0125,openai_api_key"
LLM_MODEL_CONFIG_openai_gpt_4o_mini="gpt-4o-mini-2024-07-18,openai_api_key"
LLM_MODEL_CONFIG_openai_gpt_4o="gpt-4o-2024-11-20,openai_api_key"
LLM_MODEL_CONFIG_openai_gpt_4.1_mini="gpt-4.1-mini,openai_api_key"
LLM_MODEL_CONFIG_openai_gpt_4.1="gpt-4.1,openai_api_key"
LLM_MODEL_CONFIG_openai_gpt_o3_mini="o3-mini-2025-01-31,openai_api_key"
LLM_MODEL_CONFIG_gemini_1.5_pro="gemini-1.5-pro-002"
LLM_MODEL_CONFIG_gemini_1.5_flash="gemini-1.5-flash-002"
LLM_MODEL_CONFIG_gemini_2.0_flash="gemini-2.0-flash-001"
LLM_MODEL_CONFIG_gemini_2.5_pro="gemini-2.5-pro-exp-03-25"
LLM_MODEL_CONFIG_diffbot="diffbot,diffbot_api_key"
LLM_MODEL_CONFIG_azure_ai_gpt_35="azure_deployment_name,azure_endpoint or base_url,azure_api_key,api_version"
LLM_MODEL_CONFIG_azure_ai_gpt_4o="gpt-4o,https://YOUR-ENDPOINT.openai.azure.com/,azure_api_key,api_version"
LLM_MODEL_CONFIG_groq_llama3_70b="model_name,base_url,groq_api_key"
LLM_MODEL_CONFIG_anthropic_claude_3_5_sonnet="model_name,anthropic_api_key"
LLM_MODEL_CONFIG_fireworks_llama4_maverick="model_name,fireworks_api_key"
LLM_MODEL_CONFIG_bedrock_claude_3_5_sonnet="model_name,aws_access_key_id,aws_secret_access_key,region_name"
LLM_MODEL_CONFIG_ollama_llama3="model_name,model_local_url"
YOUTUBE_TRANSCRIPT_PROXY="https://user:pass@domain:port"
EFFECTIVE_SEARCH_RATIO=5
GRAPH_CLEANUP_MODEL="openai_gpt_4o"
BEDROCK_EMBEDDING_MODEL="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.titan-embed-text-v1"
LLM_MODEL_CONFIG_bedrock_nova_micro_v1="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.nova-micro-v1:0"
LLM_MODEL_CONFIG_bedrock_nova_lite_v1="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.nova-lite-v1:0"
LLM_MODEL_CONFIG_bedrock_nova_pro_v1="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.nova-pro-v1:0"
LLM_MODEL_CONFIG_fireworks_deepseek_r1="model_name,fireworks_api_key" #model_name="accounts/fireworks/models/deepseek-r1"
LLM_MODEL_CONFIG_fireworks_deepseek_v3="model_name,fireworks_api_key" #model_name="accounts/fireworks/models/deepseek-v3"
MAX_TOKEN_CHUNK_SIZE=2000 #Max token used to process/extract the file content.
2 changes: 1 addition & 1 deletion backend/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Follow these steps to set up and run the project locally:

2. Install Dependency :

> pip install -t requirements.txt
> pip install -r requirements.txt

## Run backend project using uvicorn
Run the server:
Expand Down
4 changes: 1 addition & 3 deletions backend/constraints.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,2 @@
-f https://download.pytorch.org/whl/torch_stable.html
torch==2.3.1+cpu
torchvision==0.18.1+cpu
-f https://download.pytorch.org/whl/torch_stable.html
torchaudio==2.3.1+cpu
22 changes: 11 additions & 11 deletions backend/dbtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,18 @@

# Database configurations
neo4j_configurations = [
{
'name': 'Neo4j Config 1',
'NEO4J_URI': 'neo4j+s://73b760b4.databases.neo4j.io',
'NEO4J_USERNAME': 'neo4j',
'NEO4J_PASSWORD': 'HqwAzfG83XwcEQ-mvEG4yNpcRTHMpsgZaYW3qIGJh2I'
},
# {
# 'name': 'Neo4j Config 2',
# 'uri': 'bolt://another-host:7687',
# 'user': 'neo4j',
# 'password': 'password2'
# }
# 'name': 'Neo4j Config 1',
# 'NEO4J_URI': 'neo4j+s://73b760b4.databases.neo4j.io',
# 'NEO4J_USERNAME': 'neo4j',
# 'NEO4J_PASSWORD': 'HqwAzfG83XwcEQ-mvEG4yNpcRTHMpsgZaYW3qIGJh2I'
# },
{
'name': 'Neo4j Config 2',
'uri': 'bolt://localhost:7687',
'user': 'neo4j',
'password': 'test1234'
}
]

# Function to create a Neo4j driver
Expand Down
4 changes: 4 additions & 0 deletions backend/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,7 @@ rouge_score==0.1.2
langchain-neo4j==0.4.0
pypandoc-binary==1.15
chardet==5.2.0
torch==2.7.0
torchvision==0.22.0
protobuf==5.29.4

48 changes: 45 additions & 3 deletions backend/src/make_relationships.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import hashlib
import time
from langchain_neo4j import Neo4jVector
import re

logging.basicConfig(format='%(asctime)s - %(message)s',level='INFO')

Expand All @@ -16,14 +17,16 @@

def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_documents_chunk_chunk_Id : list):
batch_data = []
logging.info("Create HAS_ENTITY relationship between chunks and entities")
logging.info("Create HAS_ENTITY relationship between chunks and entities (with semantic normalization)")

for graph_doc_chunk_id in graph_documents_chunk_chunk_Id:
for node in graph_doc_chunk_id['graph_doc'].nodes:
# Use normalization and semantic linking
canonical_id = create_or_link_entity_node(graph, node.id, node.type)
query_data={
'chunk_id': graph_doc_chunk_id['chunk_id'],
'node_type': node.type,
'node_id': node.id
'node_id': canonical_id
}
batch_data.append(query_data)

Expand Down Expand Up @@ -176,4 +179,43 @@ def create_chunk_vector_index(graph):
if ("EquivalentSchemaRuleAlreadyExists" in str(e) or "An equivalent index already exists" in str(e)):
logging.info("Vector index already exists, skipping creation.")
else:
raise
raise

def normalize_entity_name(name: str) -> str:
    """Normalize an entity name for duplicate detection.

    Lowercases, trims surrounding whitespace, strips one leading German or
    English article, and collapses internal whitespace runs, so that e.g.
    "The  Apple " and "apple" map to the same canonical key.

    Args:
        name: Raw entity name as extracted from a document.

    Returns:
        The normalized name string.
    """
    name = name.strip().lower()
    # Remove German and English articles at the beginning. Using \s+
    # (rather than a single literal space) also handles doubled spaces
    # after the article, which previously left a leading space behind.
    name = re.sub(r'^(der|die|das|the|ein|eine|a|an)\s+', '', name)
    # Collapse any remaining internal whitespace runs to single spaces.
    name = re.sub(r'\s+', ' ', name).strip()
    # Optional: singularization, synonym replacement etc. (placeholder)
    return name


def create_or_link_entity_node(graph: Neo4jGraph, entity_name: str, entity_type: str = "Entity"):
    """
    Resolve *entity_name* to a canonical :Entity node.

    Looks up an existing node whose id matches the normalized form of the
    name. On a hit, a :DERIVATIVE_OF relationship is merged from the node
    carrying the original name to the canonical one (only when the two
    names differ) and the existing node's id is returned. On a miss, a new
    canonical node is created and its id returned (None if the create
    returned no row).

    NOTE(review): the DERIVATIVE_OF MERGE matches a node whose id equals
    the raw original name; if no such node exists the MERGE silently
    no-ops — confirm this is the intended behavior.
    """
    norm_name = normalize_entity_name(entity_name)
    # Look for an already-canonical node under the normalized id.
    lookup = (
        "MATCH (e:Entity) WHERE toLower(e.id) = $norm_name RETURN e LIMIT 1"
    )
    found = graph.query(lookup, {"norm_name": norm_name})

    if not found:
        # No canonical node yet — create one, keeping the original spelling.
        create_query = (
            "CREATE (e:Entity {id: $norm_name, original: $orig_name, type: $entity_type}) RETURN e.id AS id"
        )
        created = graph.query(create_query, {"norm_name": norm_name, "orig_name": entity_name, "entity_type": entity_type})
        return created[0]["id"] if created else None

    # Canonical node exists; link the raw-named node to it when names differ.
    canonical_id = found[0]["e"].get("id")
    if entity_name.strip().lower() != norm_name:
        rel_query = (
            "MATCH (orig:Entity {id: $orig_name}), (norm:Entity {id: $norm_name}) "
            "MERGE (orig)-[:DERIVATIVE_OF]->(norm)"
        )
        graph.query(rel_query, {"orig_name": entity_name, "norm_name": norm_name})
    return canonical_id
57 changes: 57 additions & 0 deletions backend/src/post_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,3 +234,60 @@ def graph_schema_consolidation(graph):
execute_graph_query(graph,query)

return None

def link_entities_with_context_and_physical_sync(graph):
    """
    Links entities from chunks with the overall context and synchronizes
    physical entities.

    Two grouping passes produce SAME_AS relationships between likely
    identical entities (e.g. 'Rewe' and 'Supermarket' at the same address):
      1. entities sharing the same non-empty address;
      2. entities with no address whose (name, type) pair matches.

    Args:
        graph: Neo4j graph handle accepted by execute_graph_query.

    Returns:
        True on completion.
    """
    # 1. Retrieve all entities reachable from chunks.
    query = '''
    MATCH (c:Chunk)-[:CONTAINS]->(e)
    RETURN DISTINCT e.id AS entity_id, e.name AS name, e.type AS type, e.address AS address, elementId(e) AS eid
    '''
    entities = execute_graph_query(graph, query)

    # 2. Map potential duplicates by normalized address. Missing properties
    #    come back from Neo4j as None, hence the `or ''` guards.
    entity_map = {}
    for ent in entities:
        key = (ent.get('address') or '').strip().lower()
        if key:
            entity_map.setdefault(key, []).append(ent)

    # 3. For each group with the same address, create SAME_AS relationships.
    for ents in entity_map.values():
        _merge_same_as_pairs(graph, ents)

    # 4. Additional rule: if name and type are identical but address is
    #    missing, also link as SAME_AS.
    name_type_map = {}
    for ent in entities:
        if not (ent.get('address') or '').strip():
            # BUG FIX: use `(... or '')` — .get('name', '') still returns
            # None when the property exists but is null, and None.strip()
            # raised AttributeError in the original.
            key = ((ent.get('name') or '').strip().lower(),
                   (ent.get('type') or '').strip().lower())
            if all(key):
                name_type_map.setdefault(key, []).append(ent)
    for ents in name_type_map.values():
        _merge_same_as_pairs(graph, ents)
    return True


def _merge_same_as_pairs(graph, ents):
    """Merge a SAME_AS relationship for every unordered pair in *ents*.

    The elementId values originate from the database itself, but they are
    still interpolated into the Cypher text — keep them out of any
    user-controlled path.
    """
    if len(ents) < 2:
        return
    for i in range(len(ents)):
        for j in range(i + 1, len(ents)):
            eid1 = ents[i]['eid']
            eid2 = ents[j]['eid']
            sameas_query = f"""
            MATCH (a), (b)
            WHERE elementId(a) = '{eid1}' AND elementId(b) = '{eid2}'
            MERGE (a)-[:SAME_AS]->(b)
            """
            execute_graph_query(graph, sameas_query)
Binary file added data/data_Football_news.pdf
Binary file not shown.
20 changes: 20 additions & 0 deletions frontend/.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
VITE_BACKEND_API_URL="http://localhost:8000"
VITE_BLOOM_URL="https://workspace-preview.neo4j.io/workspace/explore?connectURL={CONNECT_URL}&search=Show+me+a+graph&featureGenAISuggestions=true&featureGenAISuggestionsInternal=true"
VITE_REACT_APP_SOURCES="local,youtube,wiki,s3,web"
VITE_LLM_MODELS="diffbot,openai_gpt_3.5,openai_gpt_4o"
VITE_ENV="DEV"
VITE_TIME_PER_PAGE=50
VITE_CHUNK_SIZE=5242880
VITE_CHUNK_OVERLAP=20
VITE_TOKENS_PER_CHUNK=100
VITE_CHUNK_TO_COMBINE=1
VITE_LARGE_FILE_SIZE=5242880
VITE_GOOGLE_CLIENT_ID=""
VITE_CHAT_MODES=""
VITE_BATCH_SIZE=2
VITE_LLM_MODELS_PROD="openai_gpt_4o,openai_gpt_4o_mini,diffbot,gemini_1.5_flash"
VITE_FRONTEND_HOSTNAME="localhost:8080"
VITE_SEGMENT_API_URL=""
VITE_AUTH0_CLIENT_ID=""
VITE_AUTH0_DOMAIN=""
VITE_SKIP_AUTH=true
Loading