Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Python package

on:
push:
branches: [ "main" ]
pull_request:
branches: [ "main" ]

jobs:
build:

runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.9", "3.10", "3.11"]

steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
pytest
8 changes: 5 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,10 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.vscode/launch.json

.idea/
.vscode/

temp.pdf
google-cloud-sdk
google-cloud-cli-469.0.0-linux-x86_64.tar.gz
Expand All @@ -173,4 +175,4 @@ google-cloud-cli-linux-x86_64.tar.gz
newenv
files
startupbackend.sh
startupfrontend.sh
startupfrontend.sh
3 changes: 3 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 15 additions & 0 deletions .idea/llm-graph-builder.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{
}
58 changes: 58 additions & 0 deletions backend/.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
OPENAI_API_KEY = "" #This is required if you are using openai embedding model
EMBEDDING_MODEL = "all-MiniLM-L6-v2" #this can be openai or vertexai or by default all-MiniLM-L6-v2
RAGAS_EMBEDDING_MODEL = "openai" #Keep blank if you want to use all-MiniLM-L6-v2 for ragas embeddings
IS_EMBEDDING = "TRUE"
KNN_MIN_SCORE = "0.94"
# Enable Gemini (default is False) | Can be False or True
GEMINI_ENABLED = False
# Enable Google Cloud logs (default is False) | Can be False or True
GCP_LOG_METRICS_ENABLED = False
NUMBER_OF_CHUNKS_TO_COMBINE = 6
UPDATE_GRAPH_CHUNKS_PROCESSED = 20
NEO4J_URI = ""
NEO4J_USERNAME = ""
NEO4J_PASSWORD = ""
NEO4J_DATABASE = ""
AWS_ACCESS_KEY_ID = ""
AWS_SECRET_ACCESS_KEY = ""
LANGCHAIN_API_KEY = ""
LANGCHAIN_PROJECT = ""
LANGCHAIN_TRACING_V2 = ""
LANGCHAIN_ENDPOINT = ""
GCS_FILE_CACHE = "" #save the file into GCS or local, Should be True or False
NEO4J_USER_AGENT=""
ENABLE_USER_AGENT = ""
LLM_MODEL_CONFIG_model_version=""
ENTITY_EMBEDDING="TRUE" # TRUE or FALSE based on whether to create embeddings for entities suitable for entity vector mode
DUPLICATE_SCORE_VALUE =0.97
DUPLICATE_TEXT_DISTANCE =3
DEFAULT_DIFFBOT_CHAT_MODEL="openai_gpt_4o" #whichever model is specified here must have a config entry in the format below
#examples
LLM_MODEL_CONFIG_openai_gpt_3.5="gpt-3.5-turbo-0125,openai_api_key"
LLM_MODEL_CONFIG_openai_gpt_4o_mini="gpt-4o-mini-2024-07-18,openai_api_key"
LLM_MODEL_CONFIG_openai_gpt_4o="gpt-4o-2024-11-20,openai_api_key"
LLM_MODEL_CONFIG_openai_gpt_4.1_mini="gpt-4.1-mini,openai_api_key"
LLM_MODEL_CONFIG_openai_gpt_4.1="gpt-4.1,openai_api_key"
LLM_MODEL_CONFIG_openai_gpt_o3_mini="o3-mini-2025-01-31,openai_api_key"
LLM_MODEL_CONFIG_gemini_1.5_pro="gemini-1.5-pro-002"
LLM_MODEL_CONFIG_gemini_1.5_flash="gemini-1.5-flash-002"
LLM_MODEL_CONFIG_gemini_2.0_flash="gemini-2.0-flash-001"
LLM_MODEL_CONFIG_gemini_2.5_pro="gemini-2.5-pro-exp-03-25"
LLM_MODEL_CONFIG_diffbot="diffbot,diffbot_api_key"
LLM_MODEL_CONFIG_azure_ai_gpt_35="azure_deployment_name,azure_endpoint or base_url,azure_api_key,api_version"
LLM_MODEL_CONFIG_azure_ai_gpt_4o="gpt-4o,https://YOUR-ENDPOINT.openai.azure.com/,azure_api_key,api_version"
LLM_MODEL_CONFIG_groq_llama3_70b="model_name,base_url,groq_api_key"
LLM_MODEL_CONFIG_anthropic_claude_3_5_sonnet="model_name,anthropic_api_key"
LLM_MODEL_CONFIG_fireworks_llama4_maverick="model_name,fireworks_api_key"
LLM_MODEL_CONFIG_bedrock_claude_3_5_sonnet="model_name,aws_access_key_id,aws_secret_access_key,region_name"
LLM_MODEL_CONFIG_ollama_llama3="model_name,model_local_url"
YOUTUBE_TRANSCRIPT_PROXY="https://user:pass@domain:port"
EFFECTIVE_SEARCH_RATIO=5
GRAPH_CLEANUP_MODEL="openai_gpt_4o"
BEDROCK_EMBEDDING_MODEL="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.titan-embed-text-v1"
LLM_MODEL_CONFIG_bedrock_nova_micro_v1="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.nova-micro-v1:0"
LLM_MODEL_CONFIG_bedrock_nova_lite_v1="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.nova-lite-v1:0"
LLM_MODEL_CONFIG_bedrock_nova_pro_v1="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.nova-pro-v1:0"
LLM_MODEL_CONFIG_fireworks_deepseek_r1="model_name,fireworks_api_key" #model_name="accounts/fireworks/models/deepseek-r1"
LLM_MODEL_CONFIG_fireworks_deepseek_v3="model_name,fireworks_api_key" #model_name="accounts/fireworks/models/deepseek-v3"
MAX_TOKEN_CHUNK_SIZE=2000 #Max token used to process/extract the file content.
2 changes: 1 addition & 1 deletion backend/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Follow these steps to set up and run the project locally:

2. Install Dependency :

> pip install -t requirements.txt
> pip install -r requirements.txt

## Run backend project using uvicorn
Run the server:
Expand Down
4 changes: 1 addition & 3 deletions backend/constraints.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,2 @@
-f https://download.pytorch.org/whl/torch_stable.html
torch==2.3.1+cpu
torchvision==0.18.1+cpu
-f https://download.pytorch.org/whl/torch_stable.html
torchaudio==2.3.1+cpu
22 changes: 11 additions & 11 deletions backend/dbtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,18 @@

# Database configurations
neo4j_configurations = [
{
'name': 'Neo4j Config 1',
'NEO4J_URI': 'neo4j+s://73b760b4.databases.neo4j.io',
'NEO4J_USERNAME': 'neo4j',
'NEO4J_PASSWORD': 'HqwAzfG83XwcEQ-mvEG4yNpcRTHMpsgZaYW3qIGJh2I'
},
# {
# 'name': 'Neo4j Config 2',
# 'uri': 'bolt://another-host:7687',
# 'user': 'neo4j',
# 'password': 'password2'
# }
# 'name': 'Neo4j Config 1',
# 'NEO4J_URI': 'neo4j+s://73b760b4.databases.neo4j.io',
# 'NEO4J_USERNAME': 'neo4j',
# 'NEO4J_PASSWORD': 'HqwAzfG83XwcEQ-mvEG4yNpcRTHMpsgZaYW3qIGJh2I'
# },
{
'name': 'Neo4j Config 2',
'uri': 'bolt://localhost:7687',
'user': 'neo4j',
'password': 'test1234'
}
]

# Function to create a Neo4j driver
Expand Down
4 changes: 4 additions & 0 deletions backend/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,7 @@ rouge_score==0.1.2
langchain-neo4j==0.4.0
pypandoc-binary==1.15
chardet==5.2.0
torch==2.7.0
torchvision==0.22.0
protobuf==5.29.4

48 changes: 45 additions & 3 deletions backend/src/make_relationships.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import hashlib
import time
from langchain_neo4j import Neo4jVector
import re

logging.basicConfig(format='%(asctime)s - %(message)s',level='INFO')

Expand All @@ -16,14 +17,16 @@

def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_documents_chunk_chunk_Id : list):
batch_data = []
logging.info("Create HAS_ENTITY relationship between chunks and entities")
logging.info("Create HAS_ENTITY relationship between chunks and entities (with semantic normalization)")

for graph_doc_chunk_id in graph_documents_chunk_chunk_Id:
for node in graph_doc_chunk_id['graph_doc'].nodes:
# Use normalization and semantic linking
canonical_id = create_or_link_entity_node(graph, node.id, node.type)
query_data={
'chunk_id': graph_doc_chunk_id['chunk_id'],
'node_type': node.type,
'node_id': node.id
'node_id': canonical_id
}
batch_data.append(query_data)

Expand Down Expand Up @@ -176,4 +179,43 @@ def create_chunk_vector_index(graph):
if ("EquivalentSchemaRuleAlreadyExists" in str(e) or "An equivalent index already exists" in str(e)):
logging.info("Vector index already exists, skipping creation.")
else:
raise
raise

def normalize_entity_name(name: str) -> str:
    """Normalize an entity name for duplicate detection.

    Lowercases, trims surrounding whitespace, strips one leading German or
    English article, and collapses internal whitespace runs, so that e.g.
    "The  Apple " and "apple" map to the same canonical key.

    Args:
        name: Raw entity name as extracted from a document.

    Returns:
        The normalized name string.
    """
    name = name.strip().lower()
    # Remove German and English articles at the beginning. Using \s+
    # (rather than a single literal space) also handles doubled spaces
    # after the article, which previously left a leading space behind.
    name = re.sub(r'^(der|die|das|the|ein|eine|a|an)\s+', '', name)
    # Collapse any remaining internal whitespace runs to single spaces.
    name = re.sub(r'\s+', ' ', name).strip()
    # Optional: singularization, synonym replacement etc. (placeholder)
    return name


def create_or_link_entity_node(graph: Neo4jGraph, entity_name: str, entity_type: str = "Entity"):
    """
    Resolve *entity_name* to a canonical :Entity node.

    Looks up an existing node whose id matches the normalized form of the
    name. On a hit, a :DERIVATIVE_OF relationship is merged from the node
    carrying the original name to the canonical one (only when the two
    names differ) and the existing node's id is returned. On a miss, a new
    canonical node is created and its id returned (None if the create
    returned no row).

    NOTE(review): the DERIVATIVE_OF MERGE matches a node whose id equals
    the raw original name; if no such node exists the MERGE silently
    no-ops — confirm this is the intended behavior.
    """
    norm_name = normalize_entity_name(entity_name)
    # Look for an already-canonical node under the normalized id.
    lookup = (
        "MATCH (e:Entity) WHERE toLower(e.id) = $norm_name RETURN e LIMIT 1"
    )
    found = graph.query(lookup, {"norm_name": norm_name})

    if not found:
        # No canonical node yet — create one, keeping the original spelling.
        create_query = (
            "CREATE (e:Entity {id: $norm_name, original: $orig_name, type: $entity_type}) RETURN e.id AS id"
        )
        created = graph.query(create_query, {"norm_name": norm_name, "orig_name": entity_name, "entity_type": entity_type})
        return created[0]["id"] if created else None

    # Canonical node exists; link the raw-named node to it when names differ.
    canonical_id = found[0]["e"].get("id")
    if entity_name.strip().lower() != norm_name:
        rel_query = (
            "MATCH (orig:Entity {id: $orig_name}), (norm:Entity {id: $norm_name}) "
            "MERGE (orig)-[:DERIVATIVE_OF]->(norm)"
        )
        graph.query(rel_query, {"orig_name": entity_name, "norm_name": norm_name})
    return canonical_id
57 changes: 57 additions & 0 deletions backend/src/post_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,3 +234,60 @@ def graph_schema_consolidation(graph):
execute_graph_query(graph,query)

return None

def link_entities_with_context_and_physical_sync(graph):
    """
    Links entities from chunks with the overall context and synchronizes
    physical entities.

    Two grouping passes produce SAME_AS relationships between likely
    identical entities (e.g. 'Rewe' and 'Supermarket' at the same address):
      1. entities sharing the same non-empty address;
      2. entities with no address whose (name, type) pair matches.

    Args:
        graph: Neo4j graph handle accepted by execute_graph_query.

    Returns:
        True on completion.
    """
    # 1. Retrieve all entities reachable from chunks.
    query = '''
    MATCH (c:Chunk)-[:CONTAINS]->(e)
    RETURN DISTINCT e.id AS entity_id, e.name AS name, e.type AS type, e.address AS address, elementId(e) AS eid
    '''
    entities = execute_graph_query(graph, query)

    # 2. Map potential duplicates by normalized address. Missing properties
    #    come back from Neo4j as None, hence the `or ''` guards.
    entity_map = {}
    for ent in entities:
        key = (ent.get('address') or '').strip().lower()
        if key:
            entity_map.setdefault(key, []).append(ent)

    # 3. For each group with the same address, create SAME_AS relationships.
    for ents in entity_map.values():
        _merge_same_as_pairs(graph, ents)

    # 4. Additional rule: if name and type are identical but address is
    #    missing, also link as SAME_AS.
    name_type_map = {}
    for ent in entities:
        if not (ent.get('address') or '').strip():
            # BUG FIX: use `(... or '')` — .get('name', '') still returns
            # None when the property exists but is null, and None.strip()
            # raised AttributeError in the original.
            key = ((ent.get('name') or '').strip().lower(),
                   (ent.get('type') or '').strip().lower())
            if all(key):
                name_type_map.setdefault(key, []).append(ent)
    for ents in name_type_map.values():
        _merge_same_as_pairs(graph, ents)
    return True


def _merge_same_as_pairs(graph, ents):
    """Merge a SAME_AS relationship for every unordered pair in *ents*.

    The elementId values originate from the database itself, but they are
    still interpolated into the Cypher text — keep them out of any
    user-controlled path.
    """
    if len(ents) < 2:
        return
    for i in range(len(ents)):
        for j in range(i + 1, len(ents)):
            eid1 = ents[i]['eid']
            eid2 = ents[j]['eid']
            sameas_query = f"""
            MATCH (a), (b)
            WHERE elementId(a) = '{eid1}' AND elementId(b) = '{eid2}'
            MERGE (a)-[:SAME_AS]->(b)
            """
            execute_graph_query(graph, sameas_query)
Binary file added data/data_Football_news.pdf
Binary file not shown.
20 changes: 20 additions & 0 deletions frontend/.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
VITE_BACKEND_API_URL="http://localhost:8000"
VITE_BLOOM_URL="https://workspace-preview.neo4j.io/workspace/explore?connectURL={CONNECT_URL}&search=Show+me+a+graph&featureGenAISuggestions=true&featureGenAISuggestionsInternal=true"
VITE_REACT_APP_SOURCES="local,youtube,wiki,s3,web"
VITE_LLM_MODELS="diffbot,openai_gpt_3.5,openai_gpt_4o"
VITE_ENV="DEV"
VITE_TIME_PER_PAGE=50
VITE_CHUNK_SIZE=5242880
VITE_CHUNK_OVERLAP=20
VITE_TOKENS_PER_CHUNK=100
VITE_CHUNK_TO_COMBINE=1
VITE_LARGE_FILE_SIZE=5242880
VITE_GOOGLE_CLIENT_ID=""
VITE_CHAT_MODES=""
VITE_BATCH_SIZE=2
VITE_LLM_MODELS_PROD="openai_gpt_4o,openai_gpt_4o_mini,diffbot,gemini_1.5_flash"
VITE_FRONTEND_HOSTNAME="localhost:8080"
VITE_SEGMENT_API_URL=""
VITE_AUTH0_CLIENT_ID=""
VITE_AUTH0_DOMAIN=""
VITE_SKIP_AUTH=true
Loading