Skip to content

Commit e4de99f

Browse files
committed
Introduce anomaly detection pipeline with tuned clustering
1 parent cc53ef2 commit e4de99f

13 files changed

+874
-354
lines changed
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
#!/usr/bin/env bash
2+
3+
# Pipeline that coordinates anomaly detection using the Graph Data Science Library of Neo4j.
4+
# It requires an already running Neo4j graph database with already scanned and analyzed artifacts.
5+
# The results will be written into the sub directory reports/anomaly-detection.
6+
7+
# Note that "scripts/prepareAnalysis.sh" is required to run prior to this script.
8+
9+
# Requires executeQueryFunctions.sh, projectionFunctions.sh, cleanupAfterReportGeneration.sh
10+
11+
# Fail on any error ("-o errexit", same as "-e" = exit on first error, "-o pipefail" = also fail on errors within piped commands)
12+
set -o errexit -o pipefail
13+
14+
# Overrideable Constants (defaults also defined in sub scripts)
15+
REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"}
16+
17+
## Get the directory this script is located in if not already set
18+
# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution.
19+
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
20+
# This way non-standard tools like readlink aren't needed.
21+
ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR:-$(CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)}
22+
echo "anomalyDetectionPipeline: ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR}"
23+
# Get the "scripts" directory by taking the path of this script, going two directories up and then into "scripts".
24+
SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/../../scripts"} # Repository directory containing the shell scripts
25+
# Get the "cypher" query directory for gathering features.
26+
ANOMALY_DETECTION_FEATURE_CYPHER_DIR=${ANOMALY_DETECTION_FEATURE_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/features"}
27+
28+
# Function to display script usage
29+
usage() {
30+
echo -e "${COLOR_ERROR}" >&2
31+
echo "Usage: $0 [--verbose]" >&2
32+
echo -e "${COLOR_DEFAULT}" >&2
33+
exit 1
34+
}
35+
36+
# Default values
37+
verboseMode="" # either "" or "--verbose"
38+
39+
# Parse command line arguments
40+
while [[ $# -gt 0 ]]; do
41+
key="$1"
42+
value="${2}"
43+
44+
case ${key} in
45+
--verbose)
46+
verboseMode="--verbose"
47+
;;
48+
*)
49+
echo -e "${COLOR_ERROR}anomalyDetectionPipeline: Error: Unknown option: ${key}${COLOR_DEFAULT}" >&2
50+
usage
51+
;;
52+
esac
53+
shift || true # ignore error when there are no more arguments
54+
done
55+
56+
# Define functions to execute a cypher query from within a given file (first and only argument) like "execute_cypher"
57+
source "${SCRIPTS_DIR}/executeQueryFunctions.sh"
58+
59+
# Define functions to create and delete Graph Projections like "createUndirectedDependencyProjection"
60+
source "${SCRIPTS_DIR}/projectionFunctions.sh"
61+
62+
# Create report directory
63+
REPORT_NAME="anomaly-detection"
64+
FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}"
65+
mkdir -p "${FULL_REPORT_DIRECTORY}"
66+
67+
# Query Parameter key pairs for projection and algorithm side
68+
PROJECTION_NAME="dependencies_projection"
69+
ALGORITHM_PROJECTION="projection_name"
70+
71+
PROJECTION_NODE="dependencies_projection_node"
72+
ALGORITHM_NODE="projection_node_label"
73+
74+
PROJECTION_WEIGHT="dependencies_projection_weight_property"
75+
ALGORITHM_WEIGHT="projection_weight_property"
76+
77+
# Code independent algorithm parameters
78+
COMMUNITY_PROPERTY="community_property=communityLeidenIdTuned"
79+
80+
# Run the anomaly detection pipeline.
81+
#
82+
# Required Parameters:
83+
# - projection_name=...
84+
# Name prefix for the in-memory projection name. Example: "package-anomaly-detection"
85+
# - projection_node_label=...
86+
# Label of the nodes that will be used for the projection. Example: "Package"
87+
# - projection_weight_property=...
88+
# Name of the relationship property that contains the dependency weight. Example: "weight"
89+
anomaly_detection_pipeline() {
90+
91+
# Query Feature: Determine the Betweenness centrality (with the directed graph projection) if not already done
92+
time execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Betweenness-Exists.cypher" "${@}" \
93+
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Betweenness-Write.cypher" "${@}" >/dev/null
94+
# Query Feature: Determine the local clustering coefficient if not already done
95+
time execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-LocalClusteringCoefficient-Exists.cypher" "${@}" \
96+
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-LocalClusteringCoefficient-Write.cypher" "${@}" >/dev/null
97+
# Query Feature: Determine the page rank if not already done
98+
time execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageRank-Exists.cypher" "${@}" \
99+
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageRank-Write.cypher" "${@}" >/dev/null
100+
# Query Feature: Determine the article rank if not already done
101+
time execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Exists.cypher" "${@}" \
102+
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Write.cypher" "${@}" >/dev/null
103+
104+
# Run Python: Get tuned Leiden communities as a reference to tune clustering
105+
time "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/tunedLeidenCommunityDetection.py" "${@}" ${verboseMode}
106+
# Run Python: Tuned Fast Random Projection and tuned HDBSCAN clustering
107+
time "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/tunedNodeEmbeddingClustering.py" "${@}" ${verboseMode}
108+
109+
# Query Results: Output all collected features into a CSV file.
110+
local nodeLabel
111+
nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
112+
execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeatures.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${nodeLabel}AnomalyDetection.csv"
113+
}
114+
115+
# -- Java Artifact Node Embeddings -------------------------------
116+
117+
if createUndirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight"; then
118+
createDirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection-directed" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight"
119+
anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=artifact-anomaly-detection" "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_WEIGHT}=weight" "${COMMUNITY_PROPERTY}"
120+
fi
121+
122+
# -- Java Package Node Embeddings --------------------------------
123+
124+
if createUndirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces"; then
125+
createDirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection-directed" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces"
126+
anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=package-anomaly-detection" "${ALGORITHM_NODE}=Package" "${ALGORITHM_WEIGHT}=weight25PercentInterfaces" "${COMMUNITY_PROPERTY}"
127+
fi
128+
129+
# -- Java Type Node Embeddings -----------------------------------
130+
131+
if createUndirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection"; then
132+
createDirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection-directed"
133+
anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=type-anomaly-detection" "${ALGORITHM_NODE}=Type" "${ALGORITHM_WEIGHT}=weight" "${COMMUNITY_PROPERTY}"
134+
fi
135+
136+
# -- Typescript Module Node Embeddings ---------------------------
137+
138+
if createUndirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight"; then
139+
createDirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding-directed" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight"
140+
anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=typescript-module-embedding" "${ALGORITHM_NODE}=Module" "${ALGORITHM_WEIGHT}=lowCouplingElement25PercentWeight" "${COMMUNITY_PROPERTY}"
141+
fi
142+
143+
# ---------------------------------------------------------------
144+
145+
# Clean-up after report generation. Empty reports will be deleted.
146+
source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${FULL_REPORT_DIRECTORY}"
147+
148+
echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished."
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
// Return the first node with a centralityArticleRank if it exists
2+
3+
MATCH (codeUnit)
4+
WHERE $projection_node_label IN labels(codeUnit)
5+
AND codeUnit.centralityArticleRank IS NOT NULL
6+
RETURN codeUnit.name AS shortCodeUnitName
7+
,elementId(codeUnit) AS nodeElementId
8+
,codeUnit.centralityArticleRank AS articleRank
9+
LIMIT 1
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// Calculates and writes the Article Rank centrality score for anomaly detection
2+
3+
CALL gds.articleRank.write(
4+
$projection_name + '-cleaned', {
5+
maxIterations: 50
6+
,relationshipWeightProperty: $projection_weight_property
7+
,writeProperty: 'centralityArticleRank'
8+
})
9+
YIELD nodePropertiesWritten, ranIterations, didConverge, preProcessingMillis, computeMillis, postProcessingMillis, writeMillis
10+
RETURN nodePropertiesWritten, ranIterations, didConverge, preProcessingMillis, computeMillis, postProcessingMillis, writeMillis
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
// Return the first node with a centralityBetweenness if it exists
2+
3+
MATCH (codeUnit)
4+
WHERE $projection_node_label IN labels(codeUnit)
5+
AND codeUnit.centralityBetweenness IS NOT NULL
6+
RETURN codeUnit.name AS shortCodeUnitName
7+
,elementId(codeUnit) AS nodeElementId
8+
,codeUnit.centralityBetweenness AS betweenness
9+
LIMIT 1
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
// Calculates and writes the Betweenness centrality score for anomaly detection
2+
3+
CALL gds.betweenness.write(
4+
$projection_name + '-directed-cleaned', {
5+
relationshipWeightProperty: $projection_weight_property
6+
,writeProperty: 'centralityBetweenness'
7+
})
8+
YIELD nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, writeMillis
9+
RETURN nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, writeMillis
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
// Return the first node with a clusteringCoefficient if it exists
2+
3+
MATCH (codeUnit)
4+
WHERE $projection_node_label IN labels(codeUnit)
5+
AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL
6+
RETURN codeUnit.name AS shortCodeUnitName
7+
,elementId(codeUnit) AS nodeElementId
8+
,codeUnit.communityLocalClusteringCoefficient AS clusteringCoefficient
9+
LIMIT 1
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
// Calculates and writes the local clustering coefficient for anomaly detection
2+
3+
CALL gds.localClusteringCoefficient.write(
4+
$projection_name + '-cleaned', {
5+
writeProperty: 'communityLocalClusteringCoefficient'
6+
})
7+
YIELD averageClusteringCoefficient, nodeCount, nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, writeMillis
8+
RETURN averageClusteringCoefficient, nodeCount, nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, writeMillis
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
// Return the first node with a centralityPageRank if it exists
2+
3+
MATCH (codeUnit)
4+
WHERE $projection_node_label IN labels(codeUnit)
5+
AND codeUnit.centralityPageRank IS NOT NULL
6+
RETURN codeUnit.name AS shortCodeUnitName
7+
,elementId(codeUnit) AS nodeElementId
8+
,codeUnit.centralityPageRank AS pageRank
9+
LIMIT 1
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// Calculates and writes the Page Rank centrality score for anomaly detection
2+
3+
CALL gds.pageRank.write(
4+
$projection_name + '-cleaned', {
5+
maxIterations: 50
6+
,relationshipWeightProperty: $projection_weight_property
7+
,writeProperty: 'centralityPageRank'
8+
})
9+
YIELD nodePropertiesWritten, ranIterations, didConverge, preProcessingMillis, computeMillis, postProcessingMillis, writeMillis
10+
RETURN nodePropertiesWritten, ranIterations, didConverge, preProcessingMillis, computeMillis, postProcessingMillis, writeMillis
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// Query code unit nodes together with all features collected for anomaly detection
2+
3+
MATCH (codeUnit)
4+
WHERE $projection_node_label IN labels(codeUnit)
5+
AND codeUnit[$community_property] IS NOT NULL
6+
AND codeUnit.incomingDependencies IS NOT NULL
7+
AND codeUnit.outgoingDependencies IS NOT NULL
8+
AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL
9+
AND codeUnit.centralityArticleRank IS NOT NULL
10+
AND codeUnit.centralityPageRank IS NOT NULL
11+
AND codeUnit.centralityBetweenness IS NOT NULL
12+
AND codeUnit.clusteringHDBSCANLabel IS NOT NULL
13+
AND codeUnit.clusteringHDBSCANProbability IS NOT NULL
14+
AND codeUnit.clusteringHDBSCANNoise IS NOT NULL
15+
AND codeUnit.embeddingFastRandomProjectionVisualizationX IS NOT NULL
16+
AND codeUnit.embeddingFastRandomProjectionVisualizationY IS NOT NULL
17+
OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
18+
WITH *, artifact.name AS artifactName
19+
OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
20+
WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName
21+
RETURN DISTINCT
22+
coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
23+
,codeUnit.name AS shortCodeUnitName
24+
,elementId(codeUnit) AS nodeElementId
25+
,coalesce(artifactName, projectName) AS projectName
26+
,codeUnit.incomingDependencies AS incomingDependencies
27+
,codeUnit.outgoingDependencies AS outgoingDependencies
28+
,codeUnit[$community_property] AS communityId
29+
,codeUnit.communityLocalClusteringCoefficient AS clusteringCoefficient
30+
,codeUnit.centralityArticleRank AS articleRank
31+
,codeUnit.centralityPageRank AS pageRank
32+
,codeUnit.centralityBetweenness AS betweenness
33+
,codeUnit.clusteringHDBSCANLabel AS clusteringLabel
34+
,codeUnit.clusteringHDBSCANProbability AS clusteringProbability
35+
,codeUnit.clusteringHDBSCANNoise AS clusteringIsNoise
36+
,codeUnit.embeddingFastRandomProjectionVisualizationX AS visualizationX
37+
,codeUnit.embeddingFastRandomProjectionVisualizationY AS visualizationY
38+
,coalesce(codeUnit.centralityPageRank, 0.00001) AS centrality

0 commit comments

Comments
 (0)