Skip to content

Commit cfa9f70

Browse files
committed
Create sub directories for each anomaly detected code unit
1 parent 9ee47df commit cfa9f70

File tree

4 files changed

+68
-58
lines changed

4 files changed

+68
-58
lines changed

domains/anomaly-detection/anomalyDetectionCsv.sh

Lines changed: 21 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -76,18 +76,22 @@ anomaly_detection_queries() {
7676

7777
local language
7878
language=$( extractQueryParameter "projection_language" "${@}" )
79-
79+
80+
# Within the absolute (full) report directory for anomaly detection, create a sub directory for every detailed type (Java_Package, Java_Type,...)
81+
local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}"
82+
mkdir -p "${detail_report_directory}"
83+
8084
echo "anomalyDetectionCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Queries for ${nodeLabel} nodes..."
81-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialImbalancedRoles.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_PotentialImbalancedRoles.csv"
82-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialOverEngineerOrIsolated.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_PotentialOverEngineerOrIsolated.csv"
85+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialImbalancedRoles.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_PotentialImbalancedRoles.csv"
86+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialOverEngineerOrIsolated.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_PotentialOverEngineerOrIsolated.csv"
8387

84-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionHiddenBridgeNodes.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_HiddenBridgeNodes.csv"
85-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPopularBottlenecks.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_PopularBottlenecks.csv"
86-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionSilentCoordinators.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_SilentCoordinators.csv"
87-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionOverReferencesUtilities.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_OverReferencesUtilities.csv"
88-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionFragileStructuralBridges.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_FragileStructuralBridges.csv"
89-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionDependencyHungryOrchestrators.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_DependencyHungryOrchestrators.csv"
90-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionUnexpectedCentralNodes.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_UnexpectedCentralNodes.csv"
88+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionHiddenBridgeNodes.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_HiddenBridgeNodes.csv"
89+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPopularBottlenecks.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_PopularBottlenecks.csv"
90+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionSilentCoordinators.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_SilentCoordinators.csv"
91+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionOverReferencesUtilities.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_OverReferencesUtilities.csv"
92+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionFragileStructuralBridges.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_FragileStructuralBridges.csv"
93+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionDependencyHungryOrchestrators.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_DependencyHungryOrchestrators.csv"
94+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionUnexpectedCentralNodes.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_UnexpectedCentralNodes.csv"
9195
}
9296

9397
# Label code units with top anomalies by archetype.
@@ -102,11 +106,15 @@ anomaly_detection_labels() {
102106
local language
103107
language=$( extractQueryParameter "projection_language" "${@}" )
104108

109+
# Within the absolute (full) report directory for anomaly detection, create a sub directory for every detailed type (Java_Package, Java_Type,...)
110+
local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}"
111+
mkdir -p "${detail_report_directory}"
112+
105113
echo "anomalyDetectionCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Labelling ${language} ${nodeLabel} anomalies..."
106114
execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeRemoveLabels.cypher" "${@}"
107-
execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeAuthority.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyArchetypeTopAuthority.csv"
108-
execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBottleneck.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyArchetypeTopBottleneck.csv"
109-
execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeHub.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyArchetypeTopHub.csv"
115+
execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeAuthority.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopAuthority.csv"
116+
execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBottleneck.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopBottleneck.csv"
117+
execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeHub.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopHub.csv"
110118
# The following two label types require Python scripts to run first and are skipped here intentionally:
111119
# execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBridge.cypher" "${@}"
112120
# execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeOutlier.cypher" "${@}"

domains/anomaly-detection/anomalyDetectionFeaturePlots.py

Lines changed: 24 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -98,7 +98,7 @@ def __is_code_language_available(self) -> bool:
9898
def __get_projection_language(self) -> str:
9999
return self.query_parameters_["projection_language"] if self.__is_code_language_available() else ""
100100

101-
def get_plot_prefix(self) -> str:
101+
def get_title_prefix(self) -> str:
102102
if self.__is_code_language_available():
103103
return self.__get_projection_language() + " " + self.__get_projection_node_label()
104104
return self.__get_projection_node_label()
@@ -812,7 +812,7 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
812812
# ------------------------------------------------------------------------------------------------------------
813813

814814
parameters = parse_input_parameters()
815-
plot_prefix = parameters.get_plot_prefix()
815+
title_prefix = parameters.get_title_prefix()
816816
report_directory = parameters.get_report_directory()
817817

818818
driver = get_graph_database_driver()
@@ -825,31 +825,31 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
825825
data['pageRank'],
826826
data['articleRank'],
827827
data['shortCodeUnitName'],
828-
title=f"{plot_prefix} distribution of PageRank - ArticleRank differences",
829-
plot_file_path=get_file_path(f"{plot_prefix}_PageRank_Minus_ArticleRank_Distribution", parameters)
828+
title=f"{title_prefix} distribution of PageRank - ArticleRank differences",
829+
plot_file_path=get_file_path("PageRank_Minus_ArticleRank_Distribution", parameters)
830830
)
831831

832832
plot_clustering_coefficient_distribution(
833833
data['clusteringCoefficient'],
834-
title=f"{plot_prefix} distribution of clustering coefficients",
835-
plot_file_path=get_file_path(f"{plot_prefix}_ClusteringCoefficient_distribution", parameters)
834+
title=f"{title_prefix} distribution of clustering coefficients",
835+
plot_file_path=get_file_path("ClusteringCoefficient_distribution", parameters)
836836
)
837837

838838
plot_clustering_coefficient_vs_page_rank(
839839
data['clusteringCoefficient'],
840840
data['pageRank'],
841841
data['shortCodeUnitName'],
842842
data['clusterNoise'],
843-
title=f"{plot_prefix} clustering coefficient versus PageRank",
844-
plot_file_path=get_file_path(f"{plot_prefix}_ClusteringCoefficient_versus_PageRank", parameters)
843+
title=f"{title_prefix} clustering coefficient versus PageRank",
844+
plot_file_path=get_file_path("ClusteringCoefficient_versus_PageRank", parameters)
845845
)
846846

847847
if (overall_cluster_count < 20):
848848
print(f"anomalyDetectionFeaturePlots: Less than 20 clusters: {overall_cluster_count}. Only one plot containing all clusters will be created.")
849849
plot_clusters(
850850
clustering_visualization_dataframe=data,
851-
title=f"{plot_prefix} all clusters overall (less than 20)",
852-
plot_file_path=get_file_path(f"{plot_prefix}_Clusters_Overall", parameters)
851+
title=f"{title_prefix} all clusters overall (less than 20)",
852+
plot_file_path=get_file_path("Clusters_Overall", parameters)
853853
)
854854
else:
855855
print(f"anomalyDetectionFeaturePlots: More than 20 clusters: {overall_cluster_count}. Different plots focussing on different features like cluster size will be created.")
@@ -858,57 +858,57 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
858858
)
859859
plot_clusters(
860860
clustering_visualization_dataframe=clusters_by_largest_size,
861-
title=f"{plot_prefix} clusters with the largest size",
862-
plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_size", parameters)
861+
title=f"{title_prefix} clusters with the largest size",
862+
plot_file_path=get_file_path("Clusters_largest_size", parameters)
863863
)
864864

865865
clusters_by_largest_max_radius = get_clusters_by_criteria(
866866
data, by='clusterRadiusMax', ascending=False, cluster_count=20
867867
)
868868
plot_clusters(
869869
clustering_visualization_dataframe=clusters_by_largest_max_radius,
870-
title=f"{plot_prefix} clusters with the largest max radius",
871-
plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_max_radius", parameters)
870+
title=f"{title_prefix} clusters with the largest max radius",
871+
plot_file_path=get_file_path("Clusters_largest_max_radius", parameters)
872872
)
873873

874874
clusters_by_largest_average_radius = get_clusters_by_criteria(
875875
data, by='clusterRadiusAverage', ascending=False, cluster_count=20
876876
)
877877
plot_clusters(
878878
clustering_visualization_dataframe=clusters_by_largest_average_radius,
879-
title=f"{plot_prefix} clusters with the largest average radius",
880-
plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_average_radius", parameters)
879+
title=f"{title_prefix} clusters with the largest average radius",
880+
plot_file_path=get_file_path("Clusters_largest_average_radius", parameters)
881881
)
882882

883883
plot_clusters_probabilities(
884884
clustering_visualization_dataframe=data,
885-
title=f"{plot_prefix} clustering probabilities (red=high uncertainty)",
886-
plot_file_path=get_file_path(f"{plot_prefix}_Cluster_probabilities", parameters)
885+
title=f"{title_prefix} clustering probabilities (red=high uncertainty)",
886+
plot_file_path=get_file_path("Cluster_probabilities", parameters)
887887
)
888888

889889
plot_cluster_noise(
890890
clustering_visualization_dataframe=data,
891-
title=f"{plot_prefix} clustering noise points that are surprisingly central (red) or popular (size)",
891+
title=f"{title_prefix} clustering noise points that are surprisingly central (red) or popular (size)",
892892
size_column_name='degree',
893893
color_column_name='pageRank',
894-
plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_highly_central_and_popular", parameters)
894+
plot_file_path=get_file_path("ClusterNoise_highly_central_and_popular", parameters)
895895
)
896896

897897
plot_cluster_noise(
898898
clustering_visualization_dataframe=data,
899-
title=f"{plot_prefix} clustering noise points that bridge flow (red) and are poorly integrated (size)",
899+
title=f"{title_prefix} clustering noise points that bridge flow (red) and are poorly integrated (size)",
900900
size_column_name='inverseClusteringCoefficient',
901901
color_column_name='betweenness',
902-
plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_poorly_integrated_bridges", parameters),
902+
plot_file_path=get_file_path("ClusterNoise_poorly_integrated_bridges", parameters),
903903
downscale_normal_sizes=0.4
904904
)
905905

906906
plot_cluster_noise(
907907
clustering_visualization_dataframe=data,
908-
title=f"{plot_prefix} clustering noise points with role inversion (size) possibly violating layering or dependency direction (red)",
908+
title=f"{title_prefix} clustering noise points with role inversion (size) possibly violating layering or dependency direction (red)",
909909
size_column_name='pageToArticleRankDifference',
910910
color_column_name='betweenness',
911-
plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_role_inverted_bridges", parameters)
911+
plot_file_path=get_file_path("ClusterNoise_role_inverted_bridges", parameters)
912912
)
913913

914914
driver.close()

domains/anomaly-detection/anomalyDetectionPython.sh

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -128,18 +128,22 @@ anomaly_detection_using_python() {
128128

129129
echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Python scripts for ${language} ${nodeLabel} nodes..."
130130

131+
# Within the absolute (full) report directory for anomaly detection, create a sub directory for every detailed type (Java_Package, Java_Type,...)
132+
local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}"
133+
mkdir -p "${detail_report_directory}"
134+
131135
# Get tuned Leiden communities as a reference to tune clustering
132136
time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedLeidenCommunityDetection.py" "${@}" ${verboseMode}
133137
# Tuned Fast Random Projection and tuned HDBSCAN clustering
134138
time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedNodeEmbeddingClustering.py" "${@}" ${verboseMode}
135139
# Reduce the dimensionality of the node embeddings down to 2D for visualization using UMAP
136140
time "${ANOMALY_DETECTION_SCRIPT_DIR}/umap2dNodeEmbeddings.py" "${@}" ${verboseMode}
137141
# Plot the results with clustering and UMAP embeddings to reveal anomalies in rare feature combinations
138-
time "${ANOMALY_DETECTION_SCRIPT_DIR}/anomalyDetectionFeaturePlots.py" "${@}" "--report_directory" "${FULL_REPORT_DIRECTORY}" ${verboseMode}
142+
time "${ANOMALY_DETECTION_SCRIPT_DIR}/anomalyDetectionFeaturePlots.py" "${@}" "--report_directory" "${detail_report_directory}" ${verboseMode}
139143
# Run an unsupervised anomaly detection algorithm including tuning and explainability
140-
time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedAnomalyDetectionExplained.py" "${@}" "--report_directory" "${FULL_REPORT_DIRECTORY}" ${verboseMode}
144+
time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedAnomalyDetectionExplained.py" "${@}" "--report_directory" "${detail_report_directory}" ${verboseMode}
141145
# Query Results: Output all collected features into a CSV file.
142-
execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeatures.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}AnomalyDetection_Features.csv"
146+
execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeatures.cypher" "${@}" > "${detail_report_directory}/Anomaly_Features.csv"
143147
}
144148

145149
# Label code units with top anomalies by archetype.

0 commit comments

Comments
 (0)