Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
package org.labkey.panoramapublic.model.validation;

import org.jetbrains.annotations.NotNull;
import org.json.JSONObject;
import org.labkey.api.data.Container;
import org.labkey.panoramapublic.speclib.LibSourceFile;

import java.util.Objects;

// For table panoramapublic.speclibsourcefile
Expand Down Expand Up @@ -78,4 +83,16 @@ public int hashCode()
{
return Objects.hash(getSourceType(), getName());
}

/**
 * Builds the JSON for this source file on top of the superclass JSON.
 * A "statusDetails" hint is added when this is an ID file carrying the DIA-NN report placeholder
 * name and the file was not found.
 *
 * @param container container passed through to the superclass implementation
 * @return the JSON object returned by the superclass, possibly with "statusDetails" added
 */
@NotNull
public JSONObject toJSON(Container container)
{
    JSONObject json = super.toJSON(container);

    boolean missingDiannReport = isIdFile()
            && LibSourceFile.DIANN_REPORT_TSV_PLACEHOLDER.equals(getName())
            && !found();
    if (!missingDiannReport)
    {
        return json;
    }

    // Tell the user where the report file is expected to be
    json.put("statusDetails", "The DIA-NN TSV report must be in the same directory as the " +
            ".speclib, and share some leading characters in the file name");
    return json;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -171,18 +171,40 @@ private static List<LibSourceFile> getLibSources(SpecLibReader libReader, ISpect
{
throw UnexpectedException.wrap(e, "Error reading source files from library file " + libFilePath.toString());
}
if (sourceFiles != null && sourceFiles.stream().anyMatch(LibSourceFile::isMaxQuantSearch))

if (sourceFiles == null) return null;

if (sourceFiles.stream().anyMatch(LibSourceFile::isMaxQuantSearch))
{
// For libraries built with MaxQuant search results we need to add additional files that are required for library building
Set<String> idFileNames = sourceFiles.stream().filter(LibSourceFile::hasIdFile).map(LibSourceFile::getIdFile).collect(Collectors.toSet());
for (String file: LibSourceFile.MAX_QUANT_ID_FILES)
for (String file : LibSourceFile.MAX_QUANT_ID_FILES)
{
if (!idFileNames.contains(file))
{
sourceFiles.add(new LibSourceFile(null, file, null));
}
}
}
else if (sourceFiles.stream().anyMatch(LibSourceFile::isDiannSearch))
{
// Building a library with DIA-NN results in Skyline requires a .speclib file and a report TSV file.
// The .blib file includes the name of .speclib but not the name of the report TSV file.
// Building a library without the TSV gives this error message in Skyline:
// "...the TSV report is required to read speclib files and must be in the same directory as the speclib
// and share some leading characters (e.g. somedata-tsv.speclib and somedata-report.tsv)..."

// At some point Skyline may start including the names of all source files in the .blib SQLite file,
// so first check if any TSV files were listed as sources in the .blib
boolean hasTsvFiles = sourceFiles.stream()
.anyMatch(file -> file.hasIdFile() && file.getIdFile().toLowerCase().endsWith(".tsv"));
if (!hasTsvFiles)
{
// If there is no TSV source listed in the .blib, then add a placeholder for the DIA-NN report file.
sourceFiles.add(new LibSourceFile(null, LibSourceFile.DIANN_REPORT_TSV_PLACEHOLDER, null));
}
}

return sourceFiles;
}

Expand Down Expand Up @@ -241,12 +263,29 @@ private void validateLibrarySources(List<LibSourceFile> sources, FileContentServ
String idFile = source.getIdFile();
if (source.hasIdFile() && !checkedFiles.contains(idFile))
{
if (LibSourceFile.DIANN_REPORT_TSV_PLACEHOLDER.equals(idFile)) continue; // We will look for this when we come to the .speclib file

checkedFiles.add(idFile);
Path path = getPath(idFile, rawFilesDirPaths, false, fcs);
SpecLibSourceFile sourceFile = new SpecLibSourceFile(idFile, PEPTIDE_ID);
sourceFile.setSpecLibValidationId(getId());
sourceFile.setPath(path != null ? path.toString() : DataFile.NOT_FOUND);
idFiles.add(sourceFile);

if (source.isDiannSearch())
{
// If this is a DIA-NN .speclib file, check for the required report TSV file.
// We are doing this because the .blib does not include the name of the report TSV file.
// We only know that: "the TSV report is required to read speclib files and must be in the
// same directory as the speclib and share some leading characters
// (e.g. somedata-tsv.speclib and somedata-report.tsv)"
Path reportFilePath = sourceFile.found() ? getDiannReportFilePath(path) : null;
SpecLibSourceFile diannReportSourceFile = new SpecLibSourceFile(LibSourceFile.DIANN_REPORT_TSV_PLACEHOLDER, PEPTIDE_ID);
diannReportSourceFile.setSpecLibValidationId(getId());
diannReportSourceFile.setPath(reportFilePath != null ? reportFilePath.toString() : DataFile.NOT_FOUND);
idFiles.add(diannReportSourceFile);
checkedFiles.add(idFile);
}
}
}
setSpectrumFiles(spectrumFiles);
Expand All @@ -266,6 +305,77 @@ private Path getPath(String name, Set<Path> rawFilesDirPaths, boolean isMaxquant
return null;
}

/**
 * Looks for the DIA-NN report TSV file among the regular files in the directory that contains
 * the given .speclib file.
 *
 * @param speclibFilePath path to the .speclib file
 * @return path of the matching report TSV file, or null if none of the files qualifies
 */
private static Path getDiannReportFilePath(Path speclibFilePath)
{
    Path parentDir = speclibFilePath.getParent();
    // Files.list returns a stream that holds an open directory handle, so close it when done
    try (Stream<Path> dirEntries = Files.list(parentDir))
    {
        List<Path> regularFiles = dirEntries.filter(Files::isRegularFile).collect(Collectors.toList());
        String speclibName = speclibFilePath.getFileName().toString();
        return getDiannReportFilePath(speclibName, regularFiles);
    }
    catch (IOException e)
    {
        throw UnexpectedException.wrap(e, "Error looking for DIA-NN report TSV file in " + parentDir);
    }
}

/**
 * Picks the most likely DIA-NN report TSV file for a .speclib file from a list of candidate files.
 * TSV candidates that share a leading prefix with the .speclib file name are tried in order of
 * decreasing common-prefix length; the first one whose first line contains the expected DIA-NN
 * report column headers is returned.
 *
 * @param specLibFileName file name of the .speclib file
 * @param candidateFiles files to consider
 * @return the best matching report TSV file, or null if no candidate qualifies
 */
private static Path getDiannReportFilePath(String specLibFileName, List<Path> candidateFiles)
{
    Map<Path, Integer> prefixLengthMap = getCommonPrefixLengthsForTsvFiles(candidateFiles, specLibFileName);

    // Find the TSV file with the longest common prefix that also has the expected column headers in the first line
    return prefixLengthMap.entrySet().stream()
            .sorted((entry1, entry2) -> {
                // Sort descending by matching prefix length
                int byPrefixLength = Integer.compare(entry2.getValue(), entry1.getValue());
                // Break ties on the path so that the result does not depend on map iteration order
                return byPrefixLength != 0 ? byPrefixLength : entry1.getKey().compareTo(entry2.getKey());
            })
            .map(Map.Entry::getKey) // File paths
            .filter(file -> hasRequiredHeaders(file)) // First line should have expected header columns
            .findFirst() // Get the first file that meets the conditions
            .orElse(null);
}

/**
 * Computes, for each TSV file in the given list, the length of the leading prefix that its base name
 * shares with the base name of the .speclib file. Files that are not TSV files, and TSV files
 * that share no leading characters, are left out of the result.
 *
 * @param files candidate files
 * @param specLibFileName file name of the .speclib file
 * @return map of TSV file path to common prefix length (always greater than 0)
 */
private static Map<Path, Integer> getCommonPrefixLengthsForTsvFiles(List<Path> files, String specLibFileName)
{
    String specLibBaseName = FileUtil.getBaseName(specLibFileName); // Remove file extension
    Map<Path, Integer> result = new HashMap<>();
    for (Path candidate : files)
    {
        String candidateName = candidate.getFileName().toString();
        if (!candidateName.toLowerCase().endsWith(".tsv"))
        {
            continue; // Only TSV files are considered
        }

        int sharedLength = commonPrefixLength(specLibBaseName, FileUtil.getBaseName(candidateName));
        if (sharedLength > 0)
        {
            result.put(candidate, sharedLength);
        }
    }
    return result;
}

/**
 * Returns the number of leading characters that the two strings have in common.
 * The comparison is case-sensitive.
 *
 * @param s1 first string
 * @param s2 second string
 * @return length of the common prefix; 0 if the first characters differ or either string is empty
 */
private static int commonPrefixLength(String s1, String s2)
{
    int limit = Math.min(s1.length(), s2.length());
    for (int i = 0; i < limit; i++)
    {
        if (s1.charAt(i) != s2.charAt(i))
        {
            return i; // First mismatch marks the end of the common prefix
        }
    }
    // One string is a prefix of the other (or both are equal)
    return limit;
}

/**
 * Checks whether the first line of the given file contains all the column headers expected in a
 * DIA-NN report TSV file. An empty file is treated as having no headers.
 *
 * @param diannReportTsv path of the candidate report TSV file
 * @return true if the tab-separated first line contains every expected header
 */
private static boolean hasRequiredHeaders(Path diannReportTsv)
{
    // Read through a reader in try-with-resources so that the file handle is always released, and so
    // that read or decoding failures are reported as IOException and handled by the catch block below.
    try (java.io.BufferedReader reader = Files.newBufferedReader(diannReportTsv))
    {
        // Read the first line of the file
        String firstLine = reader.readLine();
        if (firstLine == null)
        {
            firstLine = ""; // Empty file
        }
        // Check if the first line has the expected header columns names
        return List.of(firstLine.trim().split("\t")).containsAll(LibSourceFile.DIANN_REPORT_EXPECTED_HEADERS);
    }
    catch (IOException e)
    {
        throw UnexpectedException.wrap(e, "Error reading the first line of TSV file " + diannReportTsv);
    }
}

private Path findInDirectoryTree(java.nio.file.Path rawFilesDirPath, String fileName, boolean allowBaseName)
{
try
Expand Down Expand Up @@ -459,6 +569,116 @@ public void testAccept()
assertTrue(accept("170428_DBS_cal_7a.d", "170428_DBS_cal_7a.d.zip"));
}

/**
 * Verifies that getCommonPrefixLengthsForTsvFiles only returns TSV files that share a leading prefix
 * with the .speclib file name, and that the reported prefix lengths are correct.
 */
@Test
public void testCommonPrefixLength() throws IOException
{
    Path testDataDir = getDiannTestFilesPath();

    // The spec lib file name to compare against
    String specLibFileName = "report-lib.parquet.skyline-for-test.speclib";

    Path tsvFile1 = testDataDir.resolve("report-lib.tsv");
    Path tsvFile2 = testDataDir.resolve("report-lib-for-test.tsv");
    Path tsvFile3 = testDataDir.resolve("report-lib.parquet.tsv");
    Path tsvFile4 = testDataDir.resolve("report-lib.parquet-test.tsv");
    Path tsvFile5 = testDataDir.resolve("no-prefix-match-report.tsv");
    Path nonTsvFile1 = testDataDir.resolve("report-lib.parquet.skyline-for-test.txt");
    Path nonTsvFile2 = testDataDir.resolve("report.txt");
    Path nonTsvFile3 = testDataDir.resolve(specLibFileName);

    List<Path> files = List.of(tsvFile1, tsvFile2, tsvFile3, tsvFile4, tsvFile5, nonTsvFile1, nonTsvFile2, nonTsvFile3);

    Map<Path, Integer> prefixLengthMap = SpecLibValidator.getCommonPrefixLengthsForTsvFiles(files, specLibFileName);
    // Expect 4 TSV files in the list; files without a prefix match, and non-TSV files should be ignored.
    assertEquals("Unexpected size of prefixLengthMap", 4, prefixLengthMap.size());

    // File report-lib.tsv should have a common prefix "report-lib"
    assertTrue(prefixLengthMap.containsKey(tsvFile1));
    assertEquals("report-lib".length(), prefixLengthMap.get(tsvFile1).intValue());

    // File report-lib-for-test.tsv should have a common prefix "report-lib"
    assertTrue(prefixLengthMap.containsKey(tsvFile2));
    assertEquals("report-lib".length(), prefixLengthMap.get(tsvFile2).intValue());

    // File report-lib.parquet.tsv should have a common prefix "report-lib.parquet"
    assertTrue(prefixLengthMap.containsKey(tsvFile3));
    assertEquals("report-lib.parquet".length(), prefixLengthMap.get(tsvFile3).intValue());

    // File report-lib.parquet-test.tsv should have a common prefix "report-lib.parquet"
    assertTrue(prefixLengthMap.containsKey(tsvFile4));
    assertEquals("report-lib.parquet".length(), prefixLengthMap.get(tsvFile4).intValue());

    // File no-prefix-match-report.tsv should not have a common prefix
    assertFalse(tsvFile5 + " does not share a prefix with " + specLibFileName, prefixLengthMap.containsKey(tsvFile5));

    assertFalse(prefixLengthMap.containsKey(nonTsvFile1));
    assertFalse(prefixLengthMap.containsKey(nonTsvFile2));
    assertFalse(prefixLengthMap.containsKey(nonTsvFile3));

    // List of files that do not share a common prefix with the speclib file
    files = List.of(testDataDir.resolve("abcd.tsv"), testDataDir.resolve("1234.tsv"), testDataDir.resolve("lib.parquet.skyline.tsv"));
    prefixLengthMap = SpecLibValidator.getCommonPrefixLengthsForTsvFiles(files, specLibFileName);
    assertEquals(0, prefixLengthMap.size());
}

/**
 * Verifies the selection of the DIA-NN report TSV file from a growing list of candidate files:
 * nothing is returned until a TSV file that shares a prefix with the .speclib file and has the
 * required column headers is present, and a longer shared prefix wins.
 */
@Test
public void testGetDiannReportFilePath() throws IOException
{
    Path dataDir = getDiannTestFilesPath();
    String speclibName = "report-lib.parquet.skyline-for-test.speclib";

    assertNull("Unexpected report TSV file path returned. Input file list is empty.",
            SpecLibValidator.getDiannReportFilePath(speclibName, Collections.emptyList()));

    // TSV Files in the test directory
    Path shortPrefixTsv = dataDir.resolve("report.tsv");
    Path longPrefixTsv = dataDir.resolve("report-lib-for-test.tsv");
    Path noPrefixTsv = dataDir.resolve("no-prefix-match-report-for-test.tsv");
    // NOTE(review): this name ends in .txt, so the .tsv extension filter presumably skips it before
    // the header check is reached - confirm whether the sample file was meant to be a .tsv
    Path missingHeadersFile = dataDir.resolve("report-lib.parquet-missing-headers.txt");

    // Start with only the non-TSV files in the test directory
    List<Path> candidates = new ArrayList<>(List.of(
            dataDir.resolve("report.txt"),
            dataDir.resolve("report-lib.parquet.skyline-for-test.txt"),
            dataDir.resolve(speclibName),
            dataDir.resolve("test_diann_library.blib")));

    assertNull("Unexpected report TSV file path returned. Input list does not have any TSV files",
            SpecLibValidator.getDiannReportFilePath(speclibName, candidates));

    candidates.add(noPrefixTsv); // TSV file does not share a prefix with the speclib file
    assertNull("Unexpected report TSV file path returned. Input list does not have any TSV files that share a prefix with the speclib file",
            SpecLibValidator.getDiannReportFilePath(speclibName, candidates));

    candidates.add(missingHeadersFile); // TSV file does not have the required column headers
    assertNull("Unexpected report TSV file path returned. Input list does not have any TSV files that share a prefix with the speclib file" +
            " and have the required column headers",
            SpecLibValidator.getDiannReportFilePath(speclibName, candidates));

    candidates.add(shortPrefixTsv); // Shares a prefix and has the required column headers
    Path selected = SpecLibValidator.getDiannReportFilePath(speclibName, candidates);
    assertNotNull(selected);
    assertEquals(shortPrefixTsv, selected);

    candidates.add(longPrefixTsv); // Shares a longer prefix with the speclib file
    selected = SpecLibValidator.getDiannReportFilePath(speclibName, candidates);
    assertNotNull(selected);
    assertEquals(longPrefixTsv, selected);
}

/**
 * Returns the path of the DIA-NN library sample data directory, looked up through JunitUtil
 * for the PanoramaPublicModule.
 */
private static Path getDiannTestFilesPath() throws IOException
{
    String sampleDataDir = "TargetedMS/panoramapublic/LibraryTest-DiaNN";
    return JunitUtil.getSampleData(ModuleLoader.getInstance().getModule(PanoramaPublicModule.class), sampleDataDir)
            .toPath();
}

private ISpectrumLibrary createLibrary(Path path)
{
return new ISpectrumLibrary()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,4 +81,14 @@ public boolean isMaxQuantSearch()
{
return (hasIdFile() && getIdFile().endsWith("msms.txt")) || containsScoreType("MAXQUANT SCORE");
}

// Name used for the DIA-NN report TSV source file when its actual file name is not known.
// Declared final so that this shared constant cannot be reassigned at runtime.
public static final String DIANN_REPORT_TSV_PLACEHOLDER = "DIA-NN report file";

// These are some of the column headers that we expect to see in a DIA-NN report TSV file
public static final List<String> DIANN_REPORT_EXPECTED_HEADERS = List.of("File.Name", "Run", "Protein.Group", "Protein.Ids", "Protein.Names");

/**
 * Returns true if this source has an ID file whose name ends with the ".speclib" extension
 * (compared case-insensitively).
 */
public boolean isDiannSearch()
{
    // Lower-case with Locale.ROOT so the extension check does not depend on the default locale
    // (e.g. in a Turkish locale "SPECLIB" lower-cases to a string with a dotless i and would not match).
    return (hasIdFile() && getIdFile().toLowerCase(java.util.Locale.ROOT).endsWith(".speclib"));
}
}
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Files downloaded from https://panoramaweb.org/QuickProt_datasets.url (Ranish lab)
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
File.Name Run Protein.Group Protein.Ids Protein.Names Genes PG.Quantity PG.Normalised PG.MaxLFQ Genes.Quantity Genes.Normalised Genes.MaxLFQ Genes.MaxLFQ.Unique Modified.Sequence Stripped.Sequence Precursor.Id Precursor.Charge Q.Value PEP Global.Q.Value Protein.Q.Value PG.Q.Value Global.PG.Q.Value GG.Q.Value Translated.Q.Value Proteotypic Precursor.Quantity Precursor.Normalised Quantity.Quality RT RT.Start RT.Stop iRT Predicted.RT Predicted.iRT First.Protein.Description Lib.Q.Value Lib.PG.Q.Value Ms1.Profile.Corr Ms1.Area Ms1.Normalised Normalisation.Factor Evidence Spectrum.Similarity Averagine Mass.Evidence CScore Fragment.Quant.Raw Fragment.Correlations MS2.Scan IM iIM Predicted.IM Predicted.iIM
Z:\Omar\20241220\Raw data\D0_rep1_DIA.mzML D0_rep1_DIA P37108 P37108 SRP14_HUMAN SRP14 2.06919e+07 3.43595e+07 2.92879e+07 2.06919e+07 3.43595e+07 2.92879e+07 2.92879e+07 AAAAAAAAAPAAAATAPTTAATTAATAAQ AAAAAAAAAPAAAATAPTTAATTAATAAQ AAAAAAAAAPAAAATAPTTAATTAATAAQ3 3 0.00682898 0.0389249 0.00525982 0.000484731 0.000438597 0.000218723 0.000438982 0 1 110686 212302 0.834495 40.7852 40.6788 40.8916 45.9386 40.6427 45.3926 Signal recognition particle 14 kDa protein 0.00462798 0.000176305 0.28009 81750.1 156802 1.91806 1.68953 0.127559 0.0508348 0 0.928589 0;13949.7;0;11208.5;0;0;0;9458.04;0;0;0;0; 0;0.654654;0;0.654654;0;0;0;0.654654;0;0;0;0; 57730 0 0 0 0
Z:\Omar\20241220\Raw data\D0_rep2_DIA.mzML D0_rep2_DIA P37108 P37108 SRP14_HUMAN SRP14 1.60054e+07 2.4481e+07 1.99438e+07 1.60054e+07 2.4481e+07 1.99438e+07 1.99438e+07 AAAAAAAAAPAAAATAPTTAATTAATAAQ AAAAAAAAAPAAAATAPTTAATTAATAAQ AAAAAAAAAPAAAATAPTTAATTAATAAQ3 3 0.00227236 0.0166455 0.00525982 0.000449035 0.000411184 0.000218723 0.000411523 0 1 308546 381628 0.819633 41.4875 41.2747 41.7003 45.9386 41.4297 44.7753 Signal recognition particle 14 kDa protein 0.00462798 0.000176305 0.374476 234681 290267 1.23686 1.15183 0.138951 0.0769019 0 0.982622 0;51162.2;0;25079.7;25917.9;0;0;0;0;0;11474;200182; 0;0.380268;0;0.0403012;0.259276;0;0;0;0;0;-0.0343988;0.00347259; 58712 0 0 0 0
Z:\Omar\20241220\Raw data\D2_rep2_DIA.mzML D2_rep2_DIA P37108 P37108 SRP14_HUMAN SRP14 5.96182e+07 4.16815e+07 3.39897e+07 5.96182e+07 4.16815e+07 3.39897e+07 3.39897e+07 AAAAAAAAAPAAAATAPTTAATTAATAAQ AAAAAAAAAPAAAATAPTTAATTAATAAQ AAAAAAAAAPAAAATAPTTAATTAATAAQ3 3 0.00034626 0.000973613 0.00525982 0.000284657 0.000262881 0.000218723 0.000263089 0 1 1.01521e+06 480606 0.892339 40.6718 40.5124 40.7783 45.9386 40.9778 45.9936 Signal recognition particle 14 kDa protein 0.00462798 0.000176305 0.381614 915531 433417 0.473405 2.07644 0.502376 0.172652 0 0.997521 35164.8;117100;65947.1;121602;109001;0;0;114272;0;13159.4;0;25451.3; 0.790086;0.414912;0;0.441212;0.524186;0;0;0.778551;0;0;0;0; 57579 0 0 0 0
Z:\Omar\20241220\Raw data\D4_rep2_DIA.mzML D4_rep2_DIA P37108 P37108 SRP14_HUMAN SRP14 3.58914e+07 2.43299e+07 2.35164e+07 3.58914e+07 2.43299e+07 2.35164e+07 2.35164e+07 AAAAAAAAAPAAAATAPTTAATTAATAAQ AAAAAAAAAPAAAATAPTTAATTAATAAQ AAAAAAAAAPAAAATAPTTAATTAATAAQ3 3 0.000124016 0.00081362 0.00525982 0.000289603 0.000265887 0.000218723 0.000266028 0 1 565612 340708 0.89329 41.1094 40.9502 41.2158 45.9386 41.2654 45.7235 Signal recognition particle 14 kDa protein 0.00462798 0.000176305 0.675966 450953 271641 0.602372 1.9792 0.46964 0.0769019 0 0.997199 26633.2;137531;17003.4;78320.3;38846;0;0;65957.6;0;0;0;16332.8; 0.884287;0.151077;0;-0.211294;0.884635;0;0;0.523756;0;0;0;0; 58183 0 0 0 0
Loading