LabKey · vagisha · Feb 27, 2025 · Feb 15, 2025 · Feb 19, 2025 · Feb 20, 2025
diff --git a/panoramapublic/src/org/labkey/panoramapublic/model/validation/SpecLibSourceFile.java b/panoramapublic/src/org/labkey/panoramapublic/model/validation/SpecLibSourceFile.java
@@ -1,5 +1,10 @@
 package org.labkey.panoramapublic.model.validation;
 
+import org.jetbrains.annotations.NotNull;
+import org.json.JSONObject;
+import org.labkey.api.data.Container;
+import org.labkey.panoramapublic.speclib.LibSourceFile;
+
 import java.util.Objects;
 
 // For table panoramapublic.speclibsourcefile
@@ -78,4 +83,16 @@ public int hashCode()
     {
         return Objects.hash(getSourceType(), getName());
     }
+
+    /**
+     * Builds the JSON for this source file, starting from the JSON produced by the parent class.
+     * When this entry is an ID file named with the DIA-NN report placeholder and found() is false,
+     * a "statusDetails" message describing where the report TSV is expected is added.
+     *
+     * @param container passed through to the parent implementation
+     * @return the parent's JSON object, with "statusDetails" added for a missing DIA-NN report
+     */
+    @NotNull
+    public JSONObject toJSON(Container container)
+    {
+        JSONObject json = super.toJSON(container);
+        boolean isDiannReportPlaceholder = isIdFile() && LibSourceFile.DIANN_REPORT_TSV_PLACEHOLDER.equals(getName());
+        if (!isDiannReportPlaceholder || found())
+        {
+            // Nothing extra to report for regular files, or when the report was located
+            return json;
+        }
+        json.put("statusDetails", "The DIA-NN TSV report must be in the same directory as the " +
+                ".speclib, and share some leading characters in the file name");
+        return json;
+    }
 }
diff --git a/panoramapublic/src/org/labkey/panoramapublic/proteomexchange/validator/SpecLibValidator.java b/panoramapublic/src/org/labkey/panoramapublic/proteomexchange/validator/SpecLibValidator.java
@@ -171,18 +171,40 @@ private static List<LibSourceFile> getLibSources(SpecLibReader libReader, ISpect
         {
             throw UnexpectedException.wrap(e, "Error reading source files from library file " + libFilePath.toString());
         }
-        if (sourceFiles != null && sourceFiles.stream().anyMatch(LibSourceFile::isMaxQuantSearch))
+
+        if (sourceFiles == null) return null;
+
+        if (sourceFiles.stream().anyMatch(LibSourceFile::isMaxQuantSearch))
         {
             // For libraries built with MaxQuant search results we need to add additional files that are required for library building
             Set<String> idFileNames = sourceFiles.stream().filter(LibSourceFile::hasIdFile).map(LibSourceFile::getIdFile).collect(Collectors.toSet());
-            for (String file: LibSourceFile.MAX_QUANT_ID_FILES)
+            for (String file : LibSourceFile.MAX_QUANT_ID_FILES)
             {
                 if (!idFileNames.contains(file))
                 {
                     sourceFiles.add(new LibSourceFile(null, file, null));
                 }
             }
         }
+        else if (sourceFiles.stream().anyMatch(LibSourceFile::isDiannSearch))
+        {
+            // Building a library with DIA-NN results in Skyline requires a .speclib file and a report TSV file.
+            // The .blib file includes the name of .speclib but not the name of the report TSV file.
+            // Building a library without the TSV gives this error message in Skyline:
+            // "...the TSV report is required to read speclib files and must be in the same directory as the speclib
+            // and share some leading characters (e.g. somedata-tsv.speclib and somedata-report.tsv)..."
+
+            // At some point Skyline may start including the names of all source files in the .blib SQLite file,
+            // so first check if any TSV files were listed as sources in the .blib
+            boolean hasTsvFiles = sourceFiles.stream()
+                    .anyMatch(file -> file.hasIdFile() && file.getIdFile().toLowerCase().endsWith(".tsv"));
+            if (!hasTsvFiles)
+            {
+                // If there is no TSV source listed in the .blib, then add a placeholder for the DIA-NN report file.
+                sourceFiles.add(new LibSourceFile(null, LibSourceFile.DIANN_REPORT_TSV_PLACEHOLDER, null));
+            }
+        }
+
         return sourceFiles;
     }
 
@@ -241,12 +263,29 @@ private void validateLibrarySources(List<LibSourceFile> sources, FileContentServ
             String idFile = source.getIdFile();
             if (source.hasIdFile() && !checkedFiles.contains(idFile))
             {
+                if (LibSourceFile.DIANN_REPORT_TSV_PLACEHOLDER.equals(idFile)) continue; // We will look for this when we come to the .speclib file
+
                 checkedFiles.add(idFile);
                 Path path = getPath(idFile, rawFilesDirPaths, false, fcs);
                 SpecLibSourceFile sourceFile = new SpecLibSourceFile(idFile, PEPTIDE_ID);
                 sourceFile.setSpecLibValidationId(getId());
                 sourceFile.setPath(path != null ? path.toString() : DataFile.NOT_FOUND);
                 idFiles.add(sourceFile);
+
+                if (source.isDiannSearch())
+                {
+                    // If this is a DIA-NN .speclib file, check for the required report TSV file.
+                    // We are doing this because the .blib does not include the name of the report TSV file.
+                    // We only know that: "the TSV report is required to read speclib files and must be in the
+                    // same directory as the speclib and share some leading characters
+                    // (e.g. somedata-tsv.speclib and somedata-report.tsv)"
+                    Path reportFilePath = sourceFile.found() ? getDiannReportFilePath(path) : null;
+                    SpecLibSourceFile diannReportSourceFile = new SpecLibSourceFile(LibSourceFile.DIANN_REPORT_TSV_PLACEHOLDER, PEPTIDE_ID);
+                    diannReportSourceFile.setSpecLibValidationId(getId());
+                    diannReportSourceFile.setPath(reportFilePath != null ? reportFilePath.toString() : DataFile.NOT_FOUND);
+                    idFiles.add(diannReportSourceFile);
+                    checkedFiles.add(idFile);
+                }
             }
         }
         setSpectrumFiles(spectrumFiles);
@@ -266,6 +305,77 @@ private Path getPath(String name, Set<Path> rawFilesDirPaths, boolean isMaxquant
         return null;
     }
 
+    /**
+     * Looks for the DIA-NN report TSV that belongs to a .speclib file by examining the regular
+     * files located directly in the .speclib file's parent directory.
+     *
+     * @param speclibFilePath path to the .speclib file
+     * @return the report file selected by the file-name based lookup, or null if none qualifies
+     */
+    private static Path getDiannReportFilePath(Path speclibFilePath)
+    {
+        Path parentDir = speclibFilePath.getParent();
+        List<Path> regularFiles;
+        try (Stream<Path> dirEntries = Files.list(parentDir))
+        {
+            regularFiles = dirEntries.filter(Files::isRegularFile).collect(Collectors.toList());
+        }
+        catch (IOException e)
+        {
+            throw UnexpectedException.wrap(e, "Error looking for DIA-NN report TSV file in " + parentDir);
+        }
+        return getDiannReportFilePath(speclibFilePath.getFileName().toString(), regularFiles);
+    }
+
+    /**
+     * Picks the most likely DIA-NN report TSV for a .speclib file from a list of candidate files.
+     * Candidates are the TSV files that share a leading file-name prefix with the .speclib file.
+     * They are tried from the longest shared prefix to the shortest, and the first one whose
+     * header line has the expected columns is returned.
+     *
+     * @param specLibFileName file name of the .speclib file
+     * @param candidateFiles files to consider
+     * @return the selected report file, or null if no candidate qualifies
+     */
+    private static Path getDiannReportFilePath(String specLibFileName, List<Path> candidateFiles)
+    {
+        Map<Path, Integer> prefixLengthMap = getCommonPrefixLengthsForTsvFiles(candidateFiles, specLibFileName);
+
+        // Find the TSV file with the longest common prefix that also has the expected column headers in the first line
+        return prefixLengthMap.entrySet().stream()
+                .sorted((entry1, entry2) -> {
+                    // Sort descending by matching prefix length
+                    int byPrefixLength = Integer.compare(entry2.getValue(), entry1.getValue());
+                    // The map has no defined iteration order, so break ties on the path to keep the
+                    // selection repeatable when several files share the same prefix length.
+                    return byPrefixLength != 0 ? byPrefixLength : entry1.getKey().compareTo(entry2.getKey());
+                })
+                .map(Map.Entry::getKey)  // File paths
+                .filter(file -> hasRequiredHeaders(file)) // First line should have expected header columns
+                .findFirst() // Get the first file that meets the conditions
+                .orElse(null);
+    }
+
+    /**
+     * Maps each TSV file in the given list to the number of leading characters its base name
+     * shares with the base name of the .speclib file. Files that are not TSV files, or that
+     * share no leading characters, are left out of the map.
+     *
+     * @param files files to examine
+     * @param specLibFileName file name of the .speclib file
+     * @return map of TSV file path to shared prefix length (always greater than zero)
+     */
+    private static Map<Path, Integer> getCommonPrefixLengthsForTsvFiles(List<Path> files, String specLibFileName)
+    {
+        String libBaseName = FileUtil.getBaseName(specLibFileName); // Remove file extension
+        Map<Path, Integer> sharedPrefixLengths = new HashMap<>();
+        for (Path candidate : files)
+        {
+            String candidateName = candidate.getFileName().toString();
+            if (!candidateName.toLowerCase().endsWith(".tsv"))
+            {
+                continue; // Only TSV files are of interest
+            }
+
+            int sharedLength = commonPrefixLength(libBaseName, FileUtil.getBaseName(candidateName));
+            if (sharedLength > 0)
+            {
+                sharedPrefixLengths.put(candidate, sharedLength);
+            }
+        }
+        return sharedPrefixLengths;
+    }
+
+    /**
+     * Counts how many leading characters two strings have in common (case-sensitive).
+     *
+     * @param s1 first string
+     * @param s2 second string
+     * @return length of the shared prefix; 0 if the first characters differ or either string is empty
+     */
+    private static int commonPrefixLength(String s1, String s2)
+    {
+        int limit = Math.min(s1.length(), s2.length());
+        for (int i = 0; i < limit; i++)
+        {
+            if (s1.charAt(i) != s2.charAt(i))
+            {
+                return i; // First mismatch marks the end of the shared prefix
+            }
+        }
+        // One string is a prefix of the other (or they are equal)
+        return limit;
+    }
+
+    /**
+     * Checks whether the first line of the given file contains all of the column headers
+     * listed in LibSourceFile.DIANN_REPORT_EXPECTED_HEADERS. The line is split on tabs.
+     *
+     * @param diannReportTsv path to the candidate DIA-NN report TSV file
+     * @return true if every expected header is present in the first line; false otherwise,
+     *         including when the file is empty
+     */
+    private static boolean hasRequiredHeaders(Path diannReportTsv)
+    {
+        // Files.lines keeps the file open until the stream is closed, so close it explicitly
+        try (Stream<String> lines = Files.lines(diannReportTsv))
+        {
+            // Read the first line of the file
+            String firstLine = lines.findFirst().orElse("");
+            // Check if the first line has the expected header columns names
+            return List.of(firstLine.trim().split("\t")).containsAll(LibSourceFile.DIANN_REPORT_EXPECTED_HEADERS);
+        }
+        catch (IOException e)
+        {
+            throw UnexpectedException.wrap(e, "Error reading the first line of TSV file " + diannReportTsv);
+        }
+    }
+
     private Path findInDirectoryTree(java.nio.file.Path rawFilesDirPath, String fileName, boolean allowBaseName)
     {
         try
@@ -459,6 +569,116 @@ public void testAccept()
             assertTrue(accept("170428_DBS_cal_7a.d", "170428_DBS_cal_7a.d.zip"));
         }
 
+        // Verifies which files get an entry in the prefix-length map, and the prefix length recorded for each.
+        @Test
+        public void testCommonPrefixLength() throws IOException
+        {
+            Path testDataDir = getDiannTestFilesPath();
+
+            // The spec lib file name to compare against
+            String specLibFileName = "report-lib.parquet.skyline-for-test.speclib";
+
+            Path tsvFile1 = testDataDir.resolve("report-lib.tsv");
+            Path tsvFile2 = testDataDir.resolve("report-lib-for-test.tsv");
+            Path tsvFile3 = testDataDir.resolve("report-lib.parquet.tsv");
+            Path tsvFile4 = testDataDir.resolve("report-lib.parquet-test.tsv");
+            Path tsvFile5 = testDataDir.resolve("no-prefix-match-report.tsv");
+            Path nonTsvFile1 = testDataDir.resolve("report-lib.parquet.skyline-for-test.txt");
+            Path nonTsvFile2 = testDataDir.resolve("report.txt");
+            Path nonTsvFile3 = testDataDir.resolve(specLibFileName);
+
+            List<Path> files = List.of(tsvFile1, tsvFile2, tsvFile3, tsvFile4, tsvFile5, nonTsvFile1, nonTsvFile2, nonTsvFile3);
+
+            Map<Path, Integer> prefixLengthMap = SpecLibValidator.getCommonPrefixLengthsForTsvFiles(files, specLibFileName);
+            // Expect 4 TSV files in the map; TSV files without a prefix match, and non-TSV files should be ignored.
+            assertEquals("Unexpected size of prefixLengthMap", 4, prefixLengthMap.size());
+
+            // File report-lib.tsv should have a common prefix "report-lib"
+            assertTrue(prefixLengthMap.containsKey(tsvFile1));
+            assertEquals("report-lib".length(), prefixLengthMap.get(tsvFile1).intValue());
+
+            // File report-lib-for-test.tsv should have a common prefix "report-lib"
+            assertTrue(prefixLengthMap.containsKey(tsvFile2));
+            assertEquals("report-lib".length(), prefixLengthMap.get(tsvFile2).intValue());
+
+            // File report-lib.parquet.tsv should have a common prefix "report-lib.parquet"
+            assertTrue(prefixLengthMap.containsKey(tsvFile3));
+            assertEquals("report-lib.parquet".length(), prefixLengthMap.get(tsvFile3).intValue());
+
+            // File report-lib.parquet-test.tsv should have a common prefix "report-lib.parquet"
+            assertTrue(prefixLengthMap.containsKey(tsvFile4));
+            assertEquals("report-lib.parquet".length(), prefixLengthMap.get(tsvFile4).intValue());
+
+            // File no-prefix-match-report.tsv should not have a common prefix
+            assertFalse(tsvFile5 + " does not share a prefix with " + specLibFileName, prefixLengthMap.containsKey(tsvFile5));
+
+            // Non-TSV files are never included, even when their names share a prefix
+            assertFalse(prefixLengthMap.containsKey(nonTsvFile1));
+            assertFalse(prefixLengthMap.containsKey(nonTsvFile2));
+            assertFalse(prefixLengthMap.containsKey(nonTsvFile3));
+
+            // List of files that do not share a common prefix with the speclib file
+            files = List.of(testDataDir.resolve("abcd.tsv"), testDataDir.resolve("1234.tsv"), testDataDir.resolve("lib.parquet.skyline.tsv"));
+            prefixLengthMap = SpecLibValidator.getCommonPrefixLengthsForTsvFiles(files, specLibFileName);
+            assertEquals(0, prefixLengthMap.size());
+        }
+
+        // Grows the candidate list step by step and checks which file, if any, is selected as the DIA-NN report.
+        @Test
+        public void testGetDiannReportFilePath() throws IOException
+        {
+            Path dir = getDiannTestFilesPath();
+            String specLibFileName = "report-lib.parquet.skyline-for-test.speclib";
+
+            // No candidates at all
+            assertNull("Unexpected report TSV file path returned. Input file list is empty.",
+                    SpecLibValidator.getDiannReportFilePath(specLibFileName, Collections.emptyList()));
+
+            // Start with only the non-TSV files in the test directory
+            List<Path> candidateFiles = new ArrayList<>(List.of(
+                    dir.resolve("report.txt"),
+                    dir.resolve("report-lib.parquet.skyline-for-test.txt"),
+                    dir.resolve(specLibFileName),
+                    dir.resolve("test_diann_library.blib")));
+            assertNull("Unexpected report TSV file path returned. Input list does not have any TSV files",
+                    SpecLibValidator.getDiannReportFilePath(specLibFileName, candidateFiles));
+
+            // Add a TSV file that does not share a prefix with the speclib file
+            candidateFiles.add(dir.resolve("no-prefix-match-report-for-test.tsv"));
+            assertNull("Unexpected report TSV file path returned. Input list does not have any TSV files that share a prefix with the speclib file",
+                    SpecLibValidator.getDiannReportFilePath(specLibFileName, candidateFiles));
+
+            // Add a file that does not have the required column headers.
+            // NOTE(review): this candidate has a .txt extension, so it is dropped by the ".tsv" extension
+            // filter before its headers are read. Confirm whether a ".tsv" file was intended here.
+            candidateFiles.add(dir.resolve("report-lib.parquet-missing-headers.txt"));
+            assertNull("Unexpected report TSV file path returned. Input list does not have any TSV files that share a prefix with the speclib file" +
+                            " and have the required column headers",
+                    SpecLibValidator.getDiannReportFilePath(specLibFileName, candidateFiles));
+
+            // Add a TSV file that shares a prefix and has the required column headers
+            Path shortPrefixReport = dir.resolve("report.tsv");
+            candidateFiles.add(shortPrefixReport);
+            Path selected = SpecLibValidator.getDiannReportFilePath(specLibFileName, candidateFiles);
+            assertNotNull(selected);
+            assertEquals(shortPrefixReport, selected);
+
+            // Add a TSV file that shares a longer prefix with the speclib file; it should now win
+            Path longPrefixReport = dir.resolve("report-lib-for-test.tsv");
+            candidateFiles.add(longPrefixReport);
+            selected = SpecLibValidator.getDiannReportFilePath(specLibFileName, candidateFiles);
+            assertNotNull(selected);
+            assertEquals(longPrefixReport, selected);
+        }
+
+        // Resolves the sample data directory that holds the DIA-NN library test files.
+        private static Path getDiannTestFilesPath() throws IOException
+        {
+            String sampleDataDir = "TargetedMS/panoramapublic/LibraryTest-DiaNN";
+            return JunitUtil.getSampleData(ModuleLoader.getInstance().getModule(PanoramaPublicModule.class), sampleDataDir)
+                    .toPath();
+        }
+
         private ISpectrumLibrary createLibrary(Path path)
         {
             return new ISpectrumLibrary()

diff --git a/panoramapublic/src/org/labkey/panoramapublic/speclib/LibSourceFile.java b/panoramapublic/src/org/labkey/panoramapublic/speclib/LibSourceFile.java
@@ -81,4 +81,14 @@ public boolean isMaxQuantSearch()
     {
         return (hasIdFile() && getIdFile().endsWith("msms.txt")) || containsScoreType("MAXQUANT SCORE");
     }
+
+    // Name used in place of a file name for the DIA-NN report TSV entry
+    public static final String DIANN_REPORT_TSV_PLACEHOLDER = "DIA-NN report file";
+
+    // These are some of the column headers that we expect to see in a DIA-NN report TSV file
+    public static final List<String> DIANN_REPORT_EXPECTED_HEADERS = List.of("File.Name", "Run", "Protein.Group", "Protein.Ids", "Protein.Names");
+
+    /**
+     * @return true if this source has an ID file whose name ends with ".speclib" (case-insensitive)
+     */
+    public boolean isDiannSearch()
+    {
+        // Lower-case with Locale.ROOT: under some default locales (e.g. Turkish) "I" does not
+        // lower-case to "i", so ".SPECLIB" would otherwise fail to match.
+        return (hasIdFile() && getIdFile().toLowerCase(java.util.Locale.ROOT).endsWith(".speclib"));
+    }
 }
diff --git a/...mapublic/test/sampledata/TargetedMS/panoramapublic/LibraryTest-DiaNN/DiaNNLibrary.sky.zip b/...mapublic/test/sampledata/TargetedMS/panoramapublic/LibraryTest-DiaNN/DiaNNLibrary.sky.zip
diff --git a/panoramapublic/test/sampledata/TargetedMS/panoramapublic/LibraryTest-DiaNN/README.txt b/panoramapublic/test/sampledata/TargetedMS/panoramapublic/LibraryTest-DiaNN/README.txt
@@ -0,0 +1 @@
+Files downloaded from https://panoramaweb.org/QuickProt_datasets.url (Ranish lab)
diff --git a/...ampledata/TargetedMS/panoramapublic/LibraryTest-DiaNN/no-prefix-match-report-for-test.tsv b/...ampledata/TargetedMS/panoramapublic/LibraryTest-DiaNN/no-prefix-match-report-for-test.tsv
@@ -0,0 +1,5 @@
+File.Name	Run	Protein.Group	Protein.Ids	Protein.Names	Genes	PG.Quantity	PG.Normalised	PG.MaxLFQ	Genes.Quantity	Genes.Normalised	Genes.MaxLFQ	Genes.MaxLFQ.Unique	Modified.Sequence	Stripped.Sequence	Precursor.Id	Precursor.Charge	Q.Value	PEP	Global.Q.Value	Protein.Q.Value	PG.Q.Value	Global.PG.Q.Value	GG.Q.Value	Translated.Q.Value	Proteotypic	Precursor.Quantity	Precursor.Normalised	Quantity.Quality	RT	RT.Start	RT.Stop	iRT	Predicted.RT	Predicted.iRT	First.Protein.Description	Lib.Q.Value	Lib.PG.Q.Value	Ms1.Profile.Corr	Ms1.Area	Ms1.Normalised	Normalisation.Factor	Evidence	Spectrum.Similarity	Averagine	Mass.Evidence	CScore	Fragment.Quant.Raw	Fragment.Correlations	MS2.Scan	IM	iIM	Predicted.IM	Predicted.iIM
+Z:\Omar\20241220\Raw data\D0_rep1_DIA.mzML	D0_rep1_DIA	P37108	P37108	SRP14_HUMAN	SRP14	2.06919e+07	3.43595e+07	2.92879e+07	2.06919e+07	3.43595e+07	2.92879e+07	2.92879e+07	AAAAAAAAAPAAAATAPTTAATTAATAAQ	AAAAAAAAAPAAAATAPTTAATTAATAAQ	AAAAAAAAAPAAAATAPTTAATTAATAAQ3	3	0.00682898	0.0389249	0.00525982	0.000484731	0.000438597	0.000218723	0.000438982	0	1	110686	212302	0.834495	40.7852	40.6788	40.8916	45.9386	40.6427	45.3926	Signal recognition particle 14 kDa protein	0.00462798	0.000176305	0.28009	81750.1	156802	1.91806	1.68953	0.127559	0.0508348	0	0.928589	0;13949.7;0;11208.5;0;0;0;9458.04;0;0;0;0;	0;0.654654;0;0.654654;0;0;0;0.654654;0;0;0;0;	57730	0	0	0	0
+Z:\Omar\20241220\Raw data\D0_rep2_DIA.mzML	D0_rep2_DIA	P37108	P37108	SRP14_HUMAN	SRP14	1.60054e+07	2.4481e+07	1.99438e+07	1.60054e+07	2.4481e+07	1.99438e+07	1.99438e+07	AAAAAAAAAPAAAATAPTTAATTAATAAQ	AAAAAAAAAPAAAATAPTTAATTAATAAQ	AAAAAAAAAPAAAATAPTTAATTAATAAQ3	3	0.00227236	0.0166455	0.00525982	0.000449035	0.000411184	0.000218723	0.000411523	0	1	308546	381628	0.819633	41.4875	41.2747	41.7003	45.9386	41.4297	44.7753	Signal recognition particle 14 kDa protein	0.00462798	0.000176305	0.374476	234681	290267	1.23686	1.15183	0.138951	0.0769019	0	0.982622	0;51162.2;0;25079.7;25917.9;0;0;0;0;0;11474;200182;	0;0.380268;0;0.0403012;0.259276;0;0;0;0;0;-0.0343988;0.00347259;	58712	0	0	0	0
+Z:\Omar\20241220\Raw data\D2_rep2_DIA.mzML	D2_rep2_DIA	P37108	P37108	SRP14_HUMAN	SRP14	5.96182e+07	4.16815e+07	3.39897e+07	5.96182e+07	4.16815e+07	3.39897e+07	3.39897e+07	AAAAAAAAAPAAAATAPTTAATTAATAAQ	AAAAAAAAAPAAAATAPTTAATTAATAAQ	AAAAAAAAAPAAAATAPTTAATTAATAAQ3	3	0.00034626	0.000973613	0.00525982	0.000284657	0.000262881	0.000218723	0.000263089	0	1	1.01521e+06	480606	0.892339	40.6718	40.5124	40.7783	45.9386	40.9778	45.9936	Signal recognition particle 14 kDa protein	0.00462798	0.000176305	0.381614	915531	433417	0.473405	2.07644	0.502376	0.172652	0	0.997521	35164.8;117100;65947.1;121602;109001;0;0;114272;0;13159.4;0;25451.3;	0.790086;0.414912;0;0.441212;0.524186;0;0;0.778551;0;0;0;0;	57579	0	0	0	0
+Z:\Omar\20241220\Raw data\D4_rep2_DIA.mzML	D4_rep2_DIA	P37108	P37108	SRP14_HUMAN	SRP14	3.58914e+07	2.43299e+07	2.35164e+07	3.58914e+07	2.43299e+07	2.35164e+07	2.35164e+07	AAAAAAAAAPAAAATAPTTAATTAATAAQ	AAAAAAAAAPAAAATAPTTAATTAATAAQ	AAAAAAAAAPAAAATAPTTAATTAATAAQ3	3	0.000124016	0.00081362	0.00525982	0.000289603	0.000265887	0.000218723	0.000266028	0	1	565612	340708	0.89329	41.1094	40.9502	41.2158	45.9386	41.2654	45.7235	Signal recognition particle 14 kDa protein	0.00462798	0.000176305	0.675966	450953	271641	0.602372	1.9792	0.46964	0.0769019	0	0.997199	26633.2;137531;17003.4;78320.3;38846;0;0;65957.6;0;0;0;16332.8;	0.884287;0.151077;0;-0.211294;0.884635;0;0;0.523756;0;0;0;0;	58183	0	0	0	0
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Files downloaded from https://panoramaweb.org/QuickProt_datasets.url (Ranish lab)