diff --git a/README.md b/README.md
index 6f9a2f7..a94901d 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # DKFZ SNVCalling Workflow
-An SNV calling workflow developed in the Applied Bioinformatics and Theoretical Bioinformatics groups at the DKFZ. An earlier version (pre Github) of this workflow was used in the [Pancancer](https://dockstore.org/containers/quay.io/pancancer/pcawg-dkfz-workflow) project. The workflow is only suited for human data (hg37/hg19; some later versions also hg38), because of the big role annotations play in this workflow.
+An SNV calling workflow developed in the Applied Bioinformatics and Theoretical Bioinformatics groups at the DKFZ. An earlier version (pre-GitHub) of this workflow was used in the [Pancancer](https://dockstore.org/containers/quay.io/pancancer/pcawg-dkfz-workflow) project. The workflow is suited only for human data (GRCh37/hg19 and GRCh38/hg38), because annotations play a central role in this workflow.
> *(de.NBI logo)* Your opinion matters! The development of this workflow is supported by the German Network for Bioinformatic Infrastructure (de.NBI). By completing this very short (30-60 seconds) survey you support our efforts to improve this tool.
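Both example calls below hinge on the workflow knowing which of the two supported reference builds it is running on. For orientation, here is a condensed, runnable form of the build check this PR adds to `confidenceAnnotation_SNVs.py` (the authoritative version is in the diff further down):

```python
VALID_REFGENOMES = ('hs37d5', 'GRCh38')

def validate_refgenome(refname):
    """Reject anything but the two reference builds the workflow supports."""
    if refname not in VALID_REFGENOMES:
        raise ValueError('Reference name (--refgenome) is not valid: %s. '
                         'Valid reference genome names are %s'
                         % (refname, ', '.join(VALID_REFGENOMES)))

validate_refgenome('hs37d5')   # hg19-class data passes
validate_refgenome('GRCh38')   # hg38-class data passes
```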
@@ -114,6 +114,8 @@ Please have a look at the `resources/configurationFiles/analysisSNVCalling.xml`
 
 ## Example Call
 
+For hg19:
+
 ```bash
 roddy.sh run projectConfigurationName@analysisName patientId \
 --useconfig=/path/to/your/applicationProperties.ini \
@@ -123,6 +125,22 @@ roddy.sh run projectConfigurationName@analysisName patientId \
 --cvalues="bamfile_list:/path/to/your/control.bam;/path/to/your/tumor.bam,sample_list:normal;tumor,possibleTumorSampleNamePrefixes:tumor,possibleControlSampleNamePrefixes:normal,REFERENCE_GENOME:/reference/data/hs37d5_PhiX.fa,CHROMOSOME_LENGTH_FILE:/reference/data/hs37d5_PhiX.chromSizes,extractSamplesFromOutputFiles:false"
 ```
 
+For hg38, add the following workflow configuration values to your `projectConfigs`:
+
+```xml
+
+
+
+```
+
+```bash
+roddy.sh run projectConfigurationName@analysisName patientId \
+--useconfig=/path/to/your/applicationProperties.ini \
+--configurationDirectories=/path/to/your/projectConfigs \
+--useiodir=/input/directory,/output/directory/snv \
+--cvalues="bamfile_list:/path/to/your/control.bam;/path/to/your/tumor.bam,sample_list:normal;tumor,possibleTumorSampleNamePrefixes:tumor,possibleControlSampleNamePrefixes:normal,REFERENCE_GENOME:/reference/data/GRCh38_decoy_ALT_HLA_PhiX.fa,CHROMOSOME_LENGTH_FILE:/reference/data/GRCh38_decoy_ALT_HLA_PhiX.chromSizes,extractSamplesFromOutputFiles:false"
+```
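The `--cvalues` argument packs many settings into one string: comma-separated `key:value` pairs, with semicolon-separated lists inside a value (as in `bamfile_list` and `sample_list` above). A small hypothetical helper, not part of the workflow, that assembles such a string correctly:

```python
def build_cvalues(pairs):
    """Assemble Roddy's --cvalues string: comma-separated key:value
    pairs; list values (e.g. bamfile_list) are joined with ';'."""
    parts = []
    for key, value in pairs:
        if isinstance(value, (list, tuple)):
            value = ";".join(str(v) for v in value)
        parts.append("%s:%s" % (key, value))
    return ",".join(parts)

print(build_cvalues([
    ("bamfile_list", ["/path/to/your/control.bam", "/path/to/your/tumor.bam"]),
    ("sample_list", ["normal", "tumor"]),
    ("REFERENCE_GENOME", "/reference/data/hs37d5_PhiX.fa"),
    ("extractSamplesFromOutputFiles", "false"),
]))
```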
+
 ### No Control
 
 * Set the parameter `isNoControlWorkflow` to `true`.
 
@@ -157,6 +175,20 @@ The optional configuration JSON file defaults to the `convertToStdVCF.json` resi
 
   * patch: Remove all code related to PyPy and hts-python (including `copysam.py` and `PYPY_OR_PYTHON_BINARY`)
 
+
+* 3.0.0
+
+  * Major
+    * Support for the hg38/GRCh38 reference genome and variant calling from ALT and HLA contigs.
+  * Minor
+    * For hg38: Removing mappability and repeat-element annotations from penalty calculations.
+    * skipREMAP: Option to remove repeat elements and mappability from confidence annotations in hg19.
+    * Removing EVS and ExAC AF from the annotations and from no-control workflow filtering.
+    * Support for variant calling from CRAM files.
+    * Bug fix: Removed the quotes around the raw filter option ``.
+    * Update `COWorkflowsBasePlugin` to `1.4.2`.
+
+
 * 2.2.0
 
   * minor: Update virtualenv
diff --git a/buildinfo.txt b/buildinfo.txt
index 7730d00..df81708 100755
--- a/buildinfo.txt
+++ b/buildinfo.txt
@@ -1,4 +1,4 @@
-dependson=COWorkflowsBasePlugin:1.3.0
+dependson=COWorkflowsBasePlugin:1.4.2
 JDKVersion=1.8
 GroovyVersion=2.4
 RoddyAPIVersion=3.4
diff --git a/buildversion.txt b/buildversion.txt
index 228b97b..fd8c949 100755
--- a/buildversion.txt
+++ b/buildversion.txt
@@ -1,2 +1,2 @@
-2.1
+3.0
 0
diff --git a/resources/analysisTools/snvPipeline/PurityReloaded.py b/resources/analysisTools/snvPipeline/PurityReloaded.py
index e3aadf9..f67c907 100755
--- a/resources/analysisTools/snvPipeline/PurityReloaded.py
+++ b/resources/analysisTools/snvPipeline/PurityReloaded.py
@@ -251,6 +251,10 @@ def parseVcf(file,num):
     while (l!= ""):
         t=l.split('\t')
         if (t[0][0] != "#") and isValid(t):
+            # Skip non-primary-assembly variants (HLA/ALT contigs) in purity calculations
+            if t[0].startswith('HLA') or t[0].endswith('_alt'):
+                l=vcf.readline()
+                continue
             i = chromMap[t[0]]
             if (t[12]=="germline"):
                 #DP5=string.split(string.split(t[11],";")[1],",")
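The same contig-name convention recurs throughout this PR (here, and again for the pileup flag filter in `filter_PEoverlap.py`): a chromosome name starting with `HLA` or ending in `_alt` marks a non-primary hg38 contig. A standalone sketch of that test with typical GRCh38 contig names:

```python
def is_nonprimary_contig(chrom):
    """True for hg38 HLA/ALT contigs, which this PR skips in purity
    calculations and pileups with a relaxed flag filter."""
    return chrom.startswith('HLA') or chrom.endswith('_alt')

assert is_nonprimary_contig('HLA-A*01:01:01:01')
assert is_nonprimary_contig('chr1_KI270762v1_alt')
assert not is_nonprimary_contig('chr1')
```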
diff --git a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
index deb9c98..5c1c5a1 100755
--- a/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
+++ b/resources/analysisTools/snvPipeline/confidenceAnnotation_SNVs.py
@@ -32,7 +32,15 @@ def extract_info(info, keys, sep=";"):
     rtn = '0' if rtn == "None" else rtn
     return rtn
 
+def validate_refgenome(refname):
+    valid_refgenome = ('hs37d5', 'GRCh38')
+    if refname not in valid_refgenome:
+        raise ValueError('Reference name (--refgenome) is not valid: %s. Valid reference genome names are %s' % (refname, ', '.join(valid_refgenome)))
+
 def main(args):
+
+    validate_refgenome(args.refgenome[0])
+
     if args.pancanout is not None and not args.no_makehead:
         header = '##fileformat=VCFv4.1\n' \
                  '##fileDate=' + time.strftime("%Y%m%d") + '\n' \
@@ -54,7 +62,7 @@ def main(args):
                  '##FORMAT=\n' \
                  '##FORMAT=\n' \
                  '##FILTER=\n' \
-                 '##FILTER=\n' \
+                 '##FILTER=\n' \
                  '##FILTER=\n' \
                  '##FILTER=\n' \
                  '##FILTER=\n' \
@@ -75,6 +83,7 @@ def main(args):
                  '##FILTER=\n' \
                  '##FILTER=\n' \
                  '##FILTER=\n' \
+                 '##FILTER=0.1%) or in local control database (>0.05%)">\n' \
                  '##SAMPLE=\n' \
                  '##SAMPLE=\n'\
                  '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'
@@ -98,20 +107,25 @@ def main(args):
         if line[0] == "#":
             headers = list(line[1:].rstrip().split('\t'))
-            fixed_headers = ["^INFO$", "MAPABILITY", "HISEQDEPTH", "SIMPLE_TANDEMREPEATS", "REPEAT_MASKER", "DUKE_EXCLUDED",
-                             "DAC_BLACKLIST", "SELFCHAIN", "^CONFIDENCE$", "^RECLASSIFICATION$", "^PENALTIES$",
+            fixed_headers = ["^INFO$", "MAPABILITY", "SIMPLE_TANDEMREPEATS", "REPEAT_MASKER",
+                             "^CONFIDENCE$", "^RECLASSIFICATION$", "^PENALTIES$",
                              "^seqBiasPresent$", "^seqingBiasPresent$", "^seqBiasPresent_1$", "^seqingBiasPresent_1$",
                              "^seqBiasPresent_2$", "^seqingBiasPresent_2$"]
+
+            if args.refgenome[0] == "hs37d5":
+                fixed_headers = ["^INFO$", "MAPABILITY", "HISEQDEPTH", "SIMPLE_TANDEMREPEATS", "REPEAT_MASKER", "DUKE_EXCLUDED",
+                                 "DAC_BLACKLIST", "SELFCHAIN", "^CONFIDENCE$", "^RECLASSIFICATION$", "^PENALTIES$",
+                                 "^seqBiasPresent$", "^seqingBiasPresent$", "^seqBiasPresent_1$", "^seqingBiasPresent_1$",
+                                 "^seqBiasPresent_2$", "^seqingBiasPresent_2$"]
+
             variable_headers = { "ANNOVAR_SEGDUP_COL": "^SEGDUP$", "KGENOMES_COL": "^1K_GENOMES$", "DBSNP_COL": "^DBSNP$", }
-            if args.no_control:
-                variable_headers["ExAC_COL"] = "^ExAC$"
-                variable_headers["EVS_COL"] = "^EVS$"
+            if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP:
                 variable_headers["GNOMAD_EXOMES_COL"] = "^GNOMAD_EXOMES$"
                 variable_headers["GNOMAD_GENOMES_COL"] = "^GNOMAD_GENOMES$"
                 variable_headers["LOCALCONTROL_WGS_COL"] = "^LocalControlAF_WGS$"
-                variable_headers["LOCALCONTROL_WES_COL"] = "^LocalControlAF_WES$"
-            else:
+                variable_headers["LOCALCONTROL_WES_COL"] = "^LocalControlAF_WES$"
+            if not args.no_control:
                 fixed_headers += [ "^INFO_control", "^ANNOTATION_control$", ]
             header_indices = get_header_indices(headers, args.configfile, fixed_headers, variable_headers)
@@ -188,16 +202,16 @@ def main(args):
             is_weird = False # coincidence with known artefact-producing regions
             if args.no_control:
                 classification = "somatic" # start with default somatic
-                inExAC = False
-                inEVS = False
-                inGnomAD_WES = False
-                inGnomAD_WGS = False
-                inLocalControl_WGS = False
-                inLocalControl_WES = False
             else:
                 # for potential re-classification (e.g.
low coverage in control and in dbSNP => probably germline) classification = help["ANNOTATION_control"] # start with original classification + if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP: + inGnomAD_WES = False + inGnomAD_WGS = False + inLocalControl_WES = False + inLocalControl_WGS = False + ### For pancancer # genotype tumor as originally from mpileup gttum = entries[9].split(":")[0] @@ -214,7 +228,7 @@ def main(args): in1KG = True if args.no_control: af = extract_info(help["KGENOMES_COL"].split("&")[0], "EUR_AF") - if af is not None and any(af > 0.01 for af in map(float, af.split(','))) > 0.01: + if af is not None and any(af > args.kgenome_maxMAF for af in map(float, af.split(','))): in1KG_AF = True infofield["1000G"] = "1000G" # dbSNP @@ -233,35 +247,28 @@ def main(args): if "COMMON=1" in help["DBSNP_COL"]: is_commonSNP = True - if args.no_control: + if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP: if indbSNP and is_commonSNP and not is_clinic: - reasons += "dbSNP(NoControl)" - if help["ExAC_COL_VALID"] and any(af > 1.0 for af in map(float, extract_info(help["ExAC_COL"], "AF").split(','))): - inExAC = True - infofield["ExAC"] = "ExAC" - reasons += "ExAC(NoControl)" - if help["EVS_COL_VALID"] and any(af > 1.0 for af in map(float, extract_info(help["EVS_COL"], "MAF").split(','))): - inEVS = True - infofield["EVS"] = "EVS" - reasons += "EVS(NoControl)" - - if help["GNOMAD_EXOMES_COL_VALID"] and any(af > 0.001 for af in map(float, extract_info(help["GNOMAD_EXOMES_COL"], "AF").split(','))): + #reasons += "dbSNP(NoControl)" + pass + if help["GNOMAD_EXOMES_COL_VALID"] and any(af > args.gnomAD_WES_maxMAF for af in map(float, extract_info(help["GNOMAD_EXOMES_COL"], "AF").split(','))): inGnomAD_WES = True infofield["gnomAD_Exomes"] = "gnomAD_Exomes" - reasons += "gnomAD_Exomes(NoControl)" - if help["GNOMAD_GENOMES_COL_VALID"] and any(af > 0.001 for af in map(float, extract_info(help["GNOMAD_GENOMES_COL"], "AF").split(','))): + #reasons += "gnomAD_Exomes(NoControl)" + if help["GNOMAD_GENOMES_COL_VALID"] and any(af > args.gnomAD_WGS_maxMAF for af in map(float, extract_info(help["GNOMAD_GENOMES_COL"], "AF").split(','))): inGnomAD_WGS = True infofield["gnomAD_Genomes"] = "gnomAD_Genomes" - reasons += "gnomAD_Genomes(NoControl)" + #reasons += "gnomAD_Genomes(NoControl)" - if help["LOCALCONTROL_WGS_COL_VALID"] and any(af > 0.01 for af in map(float, extract_info(help["LOCALCONTROL_WGS_COL"], "AF").split(','))): + if help["LOCALCONTROL_WGS_COL_VALID"] and any(af > args.localControl_WGS_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WGS_COL"], "AF").split(','))): inLocalControl_WGS = True infofield["LocalControl_WGS"] = "LocalControl_WGS" - reasons += "LocalControl_WGS(NoControl)" - if help["LOCALCONTROL_WES_COL_VALID"] and any(af > 0.01 for af in map(float, extract_info(help["LOCALCONTROL_WES_COL"], "AF").split(','))): + #reasons += "LocalControl_WGS(NoControl)" + if help["LOCALCONTROL_WES_COL_VALID"] and any(af > args.localControl_WES_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WES_COL"], "AF").split(','))): inLocalControl_WES = True infofield["LocalControl_WES"] = "LocalControl_WES" - reasons += "LocalControl_WES(NoControl)" + #reasons += "LocalControl_WES(NoControl)" + # Punish for biases round 1 if idx_pcrbias != -1 and idx_seqbias != -1 and args.round == 1: @@ -308,90 +315,103 @@ def main(args): reasons += "bias_filter_round2_SEQ(-3)" filterfield["BI"] = 1 - # 2) annotations of regions that cause problems: some classes of 
repeats from RepeatMasker track, - # segmental duplications, (cf. Reumers et al. 2012, Nature Biotech 30:61), external blacklists, mapability - # simple repeats and low complexity (not the same as homopolymer, but similar enough); - # some satellites are not annotated in blacklist ... - if any(word in help["REPEAT_MASKER"] for word in ["Simple_repeat", "Low_", "Satellite", ]): - is_repeat = True - confidence -= 2 - reasons += "Simple_repeat(-2)" - filterfield["RE"] = 1 - # other repeat elements to penalize at least a bit - elif help["REPEAT_MASKER_VALID"]: - confidence -= 1 - reasons += "Other_repeat(-1)" - filterfield["RE"] = 1 - - # simple tandem repeats most often coincide with other bad features - do not penalize twice - if help["SIMPLE_TANDEMREPEATS_VALID"]: - is_STR = 1 - if not is_repeat: + # Only for hg19 reference genome + if args.refgenome[0] == "hs37d5" and not args.skipREMAP: + # 2) annotations of regions that cause problems: some classes of repeats from RepeatMasker track, + # segmental duplications, (cf. Reumers et al. 2012, Nature Biotech 30:61), external blacklists, mapability + # simple repeats and low complexity (not the same as homopolymer, but similar enough); + # some satellites are not annotated in blacklist ... + if any(word in help["REPEAT_MASKER"] for word in ["Simple_repeat", "Low_", "Satellite", ]): + is_repeat = True confidence -= 2 - reasons += "Tandem_repeat(-2)" + reasons += "Simple_repeat(-2)" filterfield["RE"] = 1 - - # Segmental Duplications are less effective than homopolymers, short tandem repeats and microsatellites, - # do not penality twice - if help["ANNOVAR_SEGDUP_COL_VALID"] and not (is_repeat or is_STR): - confidence -= 2 # bad region - is_weird = True - reasons += "Segmental_dup(-2)" - filterfield["RE"] = 1 - - # Duke excluded and ENCODE DAC blacklist, only consider if not already annotated as suspicious repeat - if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"]: - confidence -= 3 # really bad region, usually centromeric repeats - is_weird = True - reasons += "Blacklist(-3)" - filterfield["BL"] = 1 - - # HiSeqDepth: regions "attracting" reads; often coincide with tandem repeats and CEN/TEL, - # not always with low mapability - if help["HISEQDEPTH_VALID"]: - confidence -= 3 # really really bad region! - is_weird = True - reasons += "Hiseqdepth(-3)" - filterfield["HSDEPTH"] = 1 - - # Mapability is 1 for unique regions, 0.5 for regions appearing twice, 0.33... 3times, ... - # Everything with really high number of occurences is artefacts - # does not always correlate with the above regions - # is overestimating badness bc. of _single_ end read simulations - if help["MAPABILITY"] == ".": - # in very rare cases (CEN), there is no mapability => ".", which is not numeric but interpreted as 0 - confidence -= 5 - reasons += "Not_mappable(-5)" - filterfield["MAP"] = 1 - else: - reduce = 0 - mapability = min(map(float, help["MAPABILITY"].split("&"))) - if mapability < 0.5: - # 0.5 does not seem to be that bad: region appears another time in - # the genome and we have paired end data! 
+ # other repeat elements to penalize at least a bit + elif help["REPEAT_MASKER_VALID"]: confidence -= 1 - reduce += 1 + reasons += "Other_repeat(-1)" + filterfield["RE"] = 1 - is_weird = True # something _is_ weird already there and known SNPs might be artefacts + # simple tandem repeats most often coincide with other bad features - do not penalize twice + if help["SIMPLE_TANDEMREPEATS_VALID"]: + is_STR = 1 + if not is_repeat: + confidence -= 2 + reasons += "Tandem_repeat(-2)" + filterfield["RE"] = 1 + + # Segmental Duplications are less effective than homopolymers, short tandem repeats and microsatellites, + # do not penalize twice + if help["ANNOVAR_SEGDUP_COL_VALID"] and not (is_repeat or is_STR): + confidence -= 2 # bad region + is_weird = True + reasons += "Segmental_dup(-2)" + filterfield["RE"] = 1 - if mapability < 0.4: # 3-4 times appearing region is worse but still not too bad + # Duke excluded and ENCODE DAC blacklist, only consider if not already annotated as suspicious repeat + if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"]: + confidence -= 3 # really bad region, usually centromeric repeats + is_weird = True + reasons += "Blacklist(-3)" + filterfield["BL"] = 1 + + # HiSeqDepth: regions "attracting" reads; often coincide with tandem repeats and CEN/TEL, + # not always with low mapability + if help["HISEQDEPTH_VALID"]: + confidence -= 3 # really really bad region! + is_weird = True + reasons += "Hiseqdepth(-3)" + filterfield["HSDEPTH"] = 1 + + # Mapability is 1 for unique regions, 0.5 for regions appearing twice, 0.33... 3times, ... + # Everything with really high number of occurrences is artefacts + # does not always correlate with the above regions + # is overestimating badness bc. of _single_ end read simulations + if help["MAPABILITY"] == ".": + # in very rare cases (CEN), there is no mapability => ".", which is not numeric but interpreted as 0 + confidence -= 5 + reasons += "Not_mappable(-5)" + filterfield["MAP"] = 1 + else: + reduce = 0 + mapability = min(map(float, help["MAPABILITY"].split("&"))) + if mapability < 0.5: + # 0.5 does not seem to be that bad: region appears another time in + # the genome and we have paired end data! confidence -= 1 reduce += 1 is_weird = True # something _is_ weird already there and known SNPs might be artefacts + if mapability < 0.4: # 3-4 times appearing region is worse but still not too bad + confidence -= 1 + reduce += 1 - if mapability < 0.25: # > 4 times appearing region - confidence -= 1 - reduce += 1 + if mapability < 0.25: # > 4 times appearing region + confidence -= 1 + reduce += 1 - if mapability < 0.1: # > 5 times is bad - confidence -= 2 - reduce += 2 + if mapability < 0.1: # > 5 times is bad + confidence -= 2 + reduce += 2 - if mapability < 0.05: # these regions are clearly very bad (Lego stacks) - confidence -= 3 - reduce += 3 + if mapability < 0.05: # these regions are clearly very bad (Lego stacks) + confidence -= 3 + reduce += 3 - filterfield["MAP"] = 1 - reasons += "Low_mappability(%s=>-%d)"%(help["MAPABILITY"], reduce) + + filterfield["MAP"] = 1 + reasons += "Low_mappability(%s=>-%d)"%(help["MAPABILITY"], reduce) + # Nov 2022: If the variant is present in gnomAD or in the local control WGS data, + # it is flagged with the FREQ filter. This is implemented for hg38 and could be + # used with the skipREMAP option for hg19.
+            # inLocalControl_WES: Needs to be generated from a new hg38 dataset
+            filterfield["FREQ"] = 0
+            if(args.refgenome[0] == 'GRCh38' or args.skipREMAP):
+                if(inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS):
+                    #reasons += 'commonSNP_or_technicalArtifact(-3)'
+                    #classification = "SNP_support_germline"
+                    #confidence -= 3
+                    filterfield["FREQ"] = 1
 
         # if others have found the SNP already, it may be interesting despite low score
         # - but only if it's not a weird region.
@@ -403,7 +423,7 @@ def main(args):
 
         if (args.no_control):
             # an SNV that is in dbSNP but not "clinic" or/and in 1 KG with high frequency is probably germline
-            if (in1KG_AF or (indbSNP and is_commonSNP and not is_clinic) or inExAC or inEVS or inGnomAD_WES or inGnomAD_WGS or inLocalControl_WES or inLocalControl_WGS):
+            if (in1KG_AF or (indbSNP and is_commonSNP and not is_clinic) or inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS or inLocalControl_WES):
                 classification = "SNP_support_germline"
 
         # 3) information from the calls and germline comparisons: coverage, strand bias, variant support, ..
@@ -585,7 +605,9 @@ def main(args):
                     reasons += "Germline_ALT<0.3(-2)"
                     filterfield["FRC"] = 1
                 if in1KG or (indbSNP and not (is_precious or is_clinic)): # but this supports again - number of reads may be low!
-                    classification += "_SNP_support"
+                    if filterfield['FREQ'] == 0:
+                        classification += "_SNP_support"
+
                 if depthC <= 10: # very probably germline due to skewed distribution at low coverage
                     classification += "_lowCov" # => can end up as "germline_SNP_support_lowCov"
@@ -631,7 +653,7 @@ def main(args):
 
         #To make sure that changes in the raw filter do not influence the final result we punish them with -3
         # TODO: JB: Why we use the second decimal of original value here?
-        if not args.runlowmaf and ((float(entries[5]) < 3 and (float("%.2f"%fr_var_tum) < 0.05 or (tvf + tvr < 5))) or (float(entries[5]) < 20 and ((tvf == 0 or tvr == 0) and (tvf + tvr <= 3)))):
+        if not args.runlowmaf and ((float(entries[5]) < 3 and (float("%.2f"%fr_var_tum) < 0.03 or (tvf + tvr < 3))) or (float(entries[5]) < 20 and ((tvf == 0 or tvr == 0) and (tvf + tvr <= 3)))):
             confidence -= 3
             reasons += "raw_filter_punishment(-3)"
             filterfield["FRQ"] = 1
@@ -679,7 +701,7 @@ def main(args):
         filters_line = [] if entries[6] == "" else entries[6].split(';')
         if args.pancanout is not None:
             filters_pancan = []
-        for filter in ("RE","BL","DP","SB","TAC","dbSNP","DB","HSDEPTH","MAP","SBAF","FRQ","TAR","UNCLEAR","DPHIGH","DPLOWC","1PS","ALTC","ALTCFR","FRC","YALT","VAF","BI"):
+        for filter in ("RE","BL","DP","SB","TAC","dbSNP","DB","HSDEPTH","MAP","SBAF","FRQ","TAR","UNCLEAR","DPHIGH","DPLOWC","1PS","ALTC","ALTCFR","FRC","YALT","VAF","BI", "FREQ"):
            if filterfield.get(filter, 0) == 1:
                 if args.pancanout is not None:
                     filters_pancan.append(filter)
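The population-frequency gate above reduces, for each resource, to "any per-allele AF above its threshold". A runnable condensation of that test (the comma-separated AF format and the thresholds follow this diff; they are the new CLI defaults introduced below):

```python
def exceeds_max_maf(af_string, max_maf):
    """AF fields can be comma-separated per ALT allele; flag the site
    if any allele frequency exceeds the threshold."""
    if not af_string:
        return False
    return any(af > max_maf for af in map(float, af_string.split(',')))

# Defaults added by this PR:
#   gnomAD WES/WGS: 0.001; local control WGS/WES: 0.01; 1000 genomes: 0.01
assert exceeds_max_maf("0.0005,0.002", 0.001)   # second allele trips the gate
assert not exceeds_max_maf("0.0002", 0.001)
```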
" \ + "(default hs37d5, ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz)" \ + "") parser.add_argument("-z", "--center", dest="center", nargs="?", default="DKFZ", help="Center (unclear if the center where the data was produced or the center of the " \ "calling pipeline; default DKFZ).") @@ -794,5 +818,11 @@ def main(args): parser.add_argument("-x", "--runexome", dest="runexome", action="store_true", default=False, help="Run on exome, will turn off the high control coverage punishment " \ "and the PCR bias filter.") + parser.add_argument('--skipREMAP', dest='skipREMAP', action='store_true', default=False) + parser.add_argument("--gnomAD_WGS_maxMAF", dest="gnomAD_WGS_maxMAF", help="Max gnomAD WGS MAF", default=0.001, type=float) + parser.add_argument("--gnomAD_WES_maxMAF", dest="gnomAD_WES_maxMAF", help="Max gnomAD WES MAF", default=0.001, type=float) + parser.add_argument("--localControl_WGS_maxMAF", dest="localControl_WGS_maxMAF", help="Max local control WGS MAF", default=0.01, type=float) + parser.add_argument("--localControl_WES_maxMAF", dest="localControl_WES_maxMAF", help="Max local control WES MAF", default=0.01, type=float) + parser.add_argument("--1000genome_maxMAF", dest="kgenome_maxMAF", help="Max 1000 genome MAF", default=0.01, type=float) args = parser.parse_args() main(args) diff --git a/resources/analysisTools/snvPipeline/createErrorPlots.py b/resources/analysisTools/snvPipeline/createErrorPlots.py index 2ed2416..a528de6 100755 --- a/resources/analysisTools/snvPipeline/createErrorPlots.py +++ b/resources/analysisTools/snvPipeline/createErrorPlots.py @@ -31,7 +31,7 @@ def calculateLogSize(size, max_val, base): return math.log(((size/max_val)*(base-1.))+1., base) def calculateRootedSize(size, max_val): - if(float(size) != 0.0): + if(float(size) != 0.0 and float(max_val) != 0.0): return np.sqrt(size/max_val) else: return 0.0 @@ -199,6 +199,10 @@ def calculateErrorMatrix(vcfFilename, referenceFilename, errorType): # 23.05.2016 JB: Excluded multiallelic SNVs if ',' in split_line[header.index("ALT")]: continue + # 21.02.2023 NP: Excluded SNVs with 'N' before or after "," in context + if {'N,', ',N'}.intersection(split_line[header.index("SEQUENCE_CONTEXT")]): + continue + chrom = split_line[header.index("CHROM")] pos = int(split_line[header.index("POS")]) context = "" diff --git a/resources/analysisTools/snvPipeline/filter_PEoverlap.py b/resources/analysisTools/snvPipeline/filter_PEoverlap.py index d601522..6d19fcf 100755 --- a/resources/analysisTools/snvPipeline/filter_PEoverlap.py +++ b/resources/analysisTools/snvPipeline/filter_PEoverlap.py @@ -1,473 +1,478 @@ -#!/usr/bin/env python -# -# Copyright (c) 2018 German Cancer Research Center (DKFZ). -# -# Distributed under the MIT License (https://opensource.org/licenses/MIT). 
-# - - -# python /home/jaegern/pyWorkspace/NGS_Read_Processing/src/filter_PEoverlap.py --inf=snvs_108031.vcf --alignmentFile=/icgc/lsdf/mb/analysis/medullo/adultMB/results_per_pid/108031/alignment/tumor_108031_merged.bam.rmdup.bam --outf=snvs_108031_PEoverlapFiltered.vcf -# more snvs_108031.vcf | python /home/jaegern/pyWorkspace/NGS_Read_Processing/src/filter_PEoverlap.py --alignmentFile=/icgc/lsdf/mb/analysis/medullo/adultMB/results_per_pid/108031/alignment/tumor_108031_merged.bam.rmdup.bam --outf=snvs_108031_PEoverlapFiltered_nonALT_FINAL.vcf - - -import platform -if platform.python_implementation() == "PyPy": - import copysam as pysam -else: # "CPython" - import pysam - -import sys, os -from vcfparser import * -import re - -def listToTabsep(listItems, sep='\t'): - return sep.join(listItems) - -def qualFromASCII(ch): - return(ord(ch) - qualScoreOffset) - - -def transformQualStr(s): - return map(qualFromASCII,s) - -# Routine for getting index of ACGTNacgtn lists -def getIndexACGTNacgtn(is_reverse, is_read1, base): - if (is_reverse): - if(is_read1): - if(base == "a"): - return ["minus", 5] - elif(base == "c"): - return ["minus", 6] - elif(base == "g"): - return ["minus", 7] - elif(base == "t"): - return ["minus", 8] - elif(base == "n"): - return ["minus", 9] - else: - if(base == "a"): - return ["plus", 5] - elif(base == "c"): - return ["plus", 6] - elif(base == "g"): - return ["plus", 7] - elif(base == "t"): - return ["plus", 8] - elif(base == "n"): - return ["plus", 9] - else: - if(is_read1): - if(base == "a"): - return ["plus", 0] - elif(base == "c"): - return ["plus", 1] - elif(base == "g"): - return ["plus", 2] - elif(base == "t"): - return ["plus", 3] - elif(base == "n"): - return ["plus", 4] - else: - if(base == "a"): - return ["minus", 0] - elif(base == "c"): - return ["minus", 1] - elif(base == "g"): - return ["minus", 2] - elif(base == "t"): - return ["minus", 3] - elif(base == "n"): - return ["minus", 4] - -def decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar): - if remove_base.lower() == REF.lower(): - if remove_is_reverse: - # check if none of the 4 DP4 values are < 0 now, which can happen due to BAQ values instead of original base qualities, which are not part of the BAM file - if DP4rr > 0: DP4rr -= 1 - else: - if DP4rf > 0: DP4rf -= 1 - elif remove_base.lower() == ALT.lower(): - if remove_is_reverse: - if DP4ar > 0: DP4ar -= 1 - else: - if DP4af > 0: DP4af -= 1 - return(DP4rf, DP4rr, DP4af, DP4ar) - -class BoolCounter: - """ A class for counter objects """ - - def __init__(self, ref_base, alt_base): - self.ref_base = ref_base - self.alt_base = alt_base - self.ref_count = 0 - self.alt_count = 0 - - def update(self, current_base): - if self.ref_base == current_base: - self.ref_count += 1 - elif self.alt_base == current_base: - self.alt_count += 1 - - def update_nonREFnonALT(self, count): - self.alt_count =+ count - - def counted(self): - return [self.ref_count, self.alt_count] - -# MAIN ANALYSIS PROCEDURE -def performAnalysis(args): - global qualScoreOffset - - # http://stackoverflow.com/questions/881696/unbuffered-stdout-in-python-as-in-python-u-from-within-the-program - unbuffered = os.fdopen(sys.stdout.fileno(), 'w', 0) - sys.stdout = unbuffered - - if args.qualityScore == 'illumina': qualScoreOffset = 64 - elif args.qualityScore == 'phred': qualScoreOffset = 33 - - #vcfInFile = open(args.inf, "r") - #outFile = open(args.outf, "w") - - # Reference file for BAQ_recalcuation and local realignment - reference_file = 
pysam.Fastafile(args.refFileName) - - mode = "r" - multiple_iterators = False - # Setting pysam read mode based on the file extension - if args.alignmentFile.split(".")[-1] == "bam": - mode += "b" - elif args.alignmentFile.split(".")[-1] == "cram": - mode += "c" - samfile = pysam.Samfile(args.alignmentFile, mode) # This should work for BAM file only (with random access). - - if args.altPosF != '': - ALT_basePositions_file = args.altPosF - - if args.refPosF != '': - REF_basePositions_file = args.refPosF - - if args.altBQF != '': - ALT_baseQualities_file = args.altBQF - - if args.refBQF != '': - REF_baseQualities_file = args.refBQF - - - for line in sys.stdin: # vcfInFile - if line[0] == "#": - headers = list(line[1:].rstrip().split('\t')) - fixed_headers = ["^CHROM$", "^POS$", "^REF$", "^ALT$", "^INFO$", ] - if args.no_control: - fixed_headers += ["^CONFIDENCE$", "^RECLASSIFICATION$", ] - else: - fixed_headers += ["^ANNOTATION_control", ] - header_indices = get_header_indices(headers, args.configfile, fixed_headers) - sys.stdout.write(line) - continue # skip header from analysis - - entries = line.strip().split('\t') - parsed_line = LineParser(entries, header_indices) - - nonREFnonALTfwd=0 - nonREFnonALTrev=0 - ALTcount=0 - - ALT_basePositions=[] - REF_basePositions=[] - REF_baseQualities=[] - ALT_baseQualities=[] - - # how to treat multiallelic SNVs? Skipped in this current version... - if ((args.no_control and int(parsed_line["CONFIDENCE"]) > 7 and "somatic" in parsed_line["RECLASSIFICATION"]) or (not args.no_control and "somatic" in parsed_line["ANNOTATION_control"])) and len(parsed_line["ALT"]) == 1: - # DP=13;AF1=0.5;AC1=1;DP4=2,3,3,4;MQ=37;FQ=75;PV4=1,1,1,1 - info_values = parsed_line["INFO"].split(';') - for info_idx, info_value in enumerate(info_values): - if info_value[:4] == "DP4=": - DP4_idx = info_idx - DP4 = map(int, info_value[4:].split(',')) - DP4rf, DP4rr, DP4af, DP4ar = DP4 - DP4_original = re.sub('DP4', 'DP4original', info_value) # Keeping a backup of original DP4 - DP4_original_alt = DP4af + DP4ar - break - - chrom=parsed_line["CHROM"] - pos=int(parsed_line["POS"]) - REF=parsed_line["REF"] - ALT=parsed_line["ALT"] - - readNameHash={} - readMateHash={} # Hash to store read and mate starting positions for duplicate marking - readMateHash_qnameLocation={} # Hash to store the location of gname in the above hash list - - - ACGTNacgtn1 = [0]*10 - ACGTNacgtn2 = [0]*10 - count_PE = BoolCounter(REF, ALT) # Starting the counter for the forward and reverse reads removed due to PE overlap detection - count_supple = BoolCounter(REF, ALT) # "" for supplementary reads, since flag_filter is added, entire supplementary detection can be removed in future versions - count_mismatch = BoolCounter(REF, ALT) # " for mismatch report - count_nonREFnonALT = BoolCounter(REF, ALT) # " to count the non-ref and non-alt base at POS - - # To match pysam and mpileup counts, a reference file is added. Given the reference file, Pysam by default computes BAQ (compute_baq). 
- for pileupcolumn in samfile.pileup(chrom, (pos-1), pos, flag_filter=3844, redo_baq=True, ignore_overlaps=False, multiple_iterators=multiple_iterators): - if pileupcolumn.pos == (pos-1): - #print 'coverage at base %s = %s' % (pileupcolumn.pos , pileupcolumn.nsegments) - for pileupread in pileupcolumn.pileups: - if pileupread.is_del: - # 31 May 2016 JB: deletion at the pileup position - continue - baseScore = transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] - readpos = pileupread.query_position - if pileupread.alignment.seq[pileupread.query_position].lower() == ALT.lower(): - if args.altBQF != '': - ALT_baseQualities.append(baseScore) - if args.altPosF != '': - if pileupread.alignment.is_reverse: - readlength = len(pileupread.alignment.seq) - readpos = (readlength - readpos) - ALT_basePositions.append(readpos) - if pileupread.alignment.seq[pileupread.query_position].lower() == REF.lower(): - if args.refBQF != '': - REF_baseQualities.append(baseScore) - if args.refPosF != '': - if pileupread.alignment.is_reverse: - readlength = len(pileupread.alignment.seq) - readpos = (readlength - readpos) - REF_basePositions.append(readpos) - - if pileupread.alignment.mapq >= args.mapq: - # http://wwwfgu.anat.ox.ac.uk/~andreas/documentation/samtools/api.html USE qqual - try: - if transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] >= args.baseq: - # check if we consider this read as a proper read in terms of number of mismatches - if args.allowedNumberOfMismatches > -1: - numberOfMismatches = None - for tag in pileupread.alignment.tags: - if tag[0] == "NM": - numberOfMismatches = tag[1] - break - else: - continue - - if numberOfMismatches is not None: - if numberOfMismatches > args.allowedNumberOfMismatches: - remove_base = pileupread.alignment.seq[pileupread.query_position] - remove_is_reverse = pileupread.alignment.is_reverse - count_mismatch.update(remove_base) - (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar) - # after decreasing the respective DP4 value, go directly to the next read - # without remembering the current read - # This will lead to an unknown read name when the paired read occurs at the same - # position. As we have already discarded the current high-mismatch read, we do not - # have to decrease DP4 values again, when the read partner occurs at the same SNV. - # We also do not increase ANCGTNacgtn for the discarded read. 
- continue - - # Check if pileupread.alignment is proper pair - if(pileupread.alignment.is_proper_pair): - # count to ACGTNacgtn list - is_reverse = pileupread.alignment.is_reverse - is_read1 = pileupread.alignment.is_read1 - base = pileupread.alignment.seq[pileupread.query_position].lower() - ACGTNacgtn_index = getIndexACGTNacgtn(is_reverse, is_read1, base) - if(ACGTNacgtn_index[0] == "plus"): - ACGTNacgtn1[ACGTNacgtn_index[1]] += 1 - else: - ACGTNacgtn2[ACGTNacgtn_index[1]] += 1 - - #if transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] >= args.baseq: # DEBUG July 23 2012: BROAD BAM problem due to pileupread.alignment.qqual being shorter sometimes than pileupread.alignment.qual - if(pileupread.alignment.query_name in readNameHash): - #print pileupread.alignment.query_name - old_qual = readNameHash[pileupread.alignment.query_name][0] - old_base = readNameHash[pileupread.alignment.query_name][1] - old_is_reverse = readNameHash[pileupread.alignment.query_name][2] - old_read_mate_tuple = readNameHash[pileupread.alignment.query_name][3] - current_qual = transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] - current_base = pileupread.alignment.seq[pileupread.query_position] - current_is_reverse = pileupread.alignment.is_reverse - current_read_mate_tuple = (pileupread.alignment.reference_id, pileupread.alignment.reference_start, pileupread.alignment.reference_end, pileupread.alignment.next_reference_id, pileupread.alignment.next_reference_start) - # if read name occurs twice for one variant, then due to overlapping PE reads, then subtract variant count from DP4 field - # if old_base is not equal to new_base remove the one with the smaller base quality - remove_base = None - remove_is_reverse = None - if(not(old_base == current_base)): - if(old_qual <= current_qual): - remove_base = old_base - remove_is_reverse = old_is_reverse - remove_old = True - else: - remove_base = current_base - remove_is_reverse = current_is_reverse - remove_old = False - else: - remove_base = current_base - remove_is_reverse = current_is_reverse - remove_old = False - - count_PE.update(remove_base) - (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar) - # If current base is better, then removing the information about old mate - # If current base is not good, then do nothing - if remove_old: - old_location = readMateHash_qnameLocation[pileupread.alignment.query_name] - del readMateHash[old_read_mate_tuple][old_location] - read_mate_tuple_value = (current_qual, (current_base, current_is_reverse, pileupread.alignment.query_name, pileupread.alignment.is_supplementary)) - if current_read_mate_tuple in readMateHash: - readMateHash[current_read_mate_tuple].append(read_mate_tuple_value) - else: - readMateHash[current_read_mate_tuple] = [] - readMateHash[current_read_mate_tuple].append(read_mate_tuple_value) - - else: - # Store base quality, base, and read direction in readNameHash - base_qual_score=transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] - read_mate_tuple = (pileupread.alignment.reference_id, pileupread.alignment.reference_start, pileupread.alignment.reference_end, pileupread.alignment.next_reference_id, pileupread.alignment.next_reference_start) - read_mate_tuple_value = (base_qual_score, (pileupread.alignment.seq[pileupread.query_position], pileupread.alignment.is_reverse, pileupread.alignment.query_name, pileupread.alignment.is_supplementary)) - - 
readNameHash[pileupread.alignment.query_name] = [base_qual_score, pileupread.alignment.seq[pileupread.query_position], pileupread.alignment.is_reverse, read_mate_tuple] - - if read_mate_tuple in readMateHash: - readMateHash[read_mate_tuple].append(read_mate_tuple_value) - else: - readMateHash[read_mate_tuple] = [] - readMateHash[read_mate_tuple].append(read_mate_tuple_value) - - readMateHash_qnameLocation[pileupread.alignment.query_name] = len(readMateHash[read_mate_tuple]) - 1 # Location of the last pushed element in the array - - except IndexError: - "soft-clipped or trimmed base, not part of the high-qual alignemnt anyways, skip" - - if transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] >= args.baseq: - - if pileupread.alignment.seq[pileupread.query_position] == ALT: - ALTcount += 1 - # samtools mpileup sometimes counts bases as variants which are neither REF nor ALT - if (pileupread.alignment.seq[pileupread.query_position] != REF) and (pileupread.alignment.seq[pileupread.query_position] != ALT): - if pileupread.alignment.is_reverse: - nonREFnonALTrev += 1 - #if DP4ar > 0: DP4ar -= 1 - else: - nonREFnonALTfwd += 1 - #if DP4af > 0: DP4af -= 1 - - if (len(REF_baseQualities) > 0): - VAF = 1.0 * len(ALT_baseQualities) / (len(REF_baseQualities) + len(ALT_baseQualities)) - elif (len(ALT_baseQualities) > 0): - VAF = 1.0 - else: - VAF = 0.0 - - if args.altBQF != '': - scoreString = ",".join([str(score) for score in ALT_baseQualities]) - if scoreString != '': - scoreString = ";".join([scoreString , str(VAF)]) - ALT_baseQualities_file.write("%s\t%s\t%s\n" % (chrom, pos, scoreString)) - - if args.refBQF != '': - scoreString = ",".join([str(score) for score in REF_baseQualities]) - if scoreString != '': - scoreString = ";".join([scoreString , str(VAF)]) - REF_baseQualities_file.write("%s\t%s\t%s\n" % (chrom, pos, scoreString)) - - if args.altPosF != '': - positionsString = ",".join([str(readpos) for readpos in ALT_basePositions]) - if positionsString != '': - ALT_basePositions_file.write("%s\t%s\t%s\n" % (chrom, pos, positionsString)) - - if args.refPosF != '': - positionsString = ",".join([str(readpos) for readpos in REF_basePositions]) - if positionsString != '': - REF_basePositions_file.write("%s\t%s\t%s\n" % (chrom, pos, positionsString)) - - break # only one pileup for a position - - # Calculating duplicates based on read-mate pair's start positions (chr id and start location) - count_duplicate = BoolCounter(REF, ALT) - - for key in readMateHash: - value_length = len(readMateHash[key]) - if value_length > 0: - sorted_values = sorted(readMateHash[key], key=lambda x: x[0]) # Sorted based on base quality - sorted_values = sorted_values[:-1] # removing the read with highest quality, so it will be retained for count - for value in sorted_values: # Removing everthing else - qual_value, decreaseInfo = value - remove_base = decreaseInfo[0] - remove_is_reverse = decreaseInfo[1] - - count_duplicate.update(remove_base) - (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar) - - if (DP4[2] + DP4[3]) > ALTcount: # that the ALTcount is larger happens often due to BAQ during samtools mpileup which doesn't change the base qual in the BAM file, but decreases base qual during calling - if DP4af >= nonREFnonALTfwd: DP4af -= nonREFnonALTfwd - - if DP4ar >= nonREFnonALTrev: DP4ar -= nonREFnonALTrev - - nonREFnonALT = nonREFnonALTrev + nonREFnonALTfwd - count_nonREFnonALT.update_nonREFnonALT(nonREFnonALT) - - #Format: DP2 -> 
"reference(forward + reverse), alt(forward + reverse)" - supple_dup_str = 'DP2sup=' + ','.join(map(str, count_supple.counted())) - supple_dup_str += ';DP2dup=' + ','.join(map(str, count_duplicate.counted())) - supple_dup_str += ';DP2pairEnd=' + ','.join(map(str, count_PE.counted())) - supple_dup_str += ';DP2mis=' + ','.join(map(str, count_mismatch.counted())) - supple_dup_str += ';DP2nonREFnonALT=' + ','.join(map(str, count_nonREFnonALT.counted())) - - ACGTNacgtn1_string = "ACGTNacgtnPLUS="+",".join([str(i) for i in ACGTNacgtn1]) - ACGTNacgtn2_string = "ACGTNacgtnMINUS="+",".join([str(i) for i in ACGTNacgtn2]) - - info_values[DP4_idx] = "DP4=" + str(DP4rf)+ "," + str(DP4rr)+ "," + str(DP4af)+ "," + str(DP4ar) - info_values.append(ACGTNacgtn1_string) - info_values.append(ACGTNacgtn2_string) - info_values.append(DP4_original) - info_values.append(supple_dup_str) - - entries[header_indices["INFO"]] = ';'.join(info_values) - - sys.stdout.write('\t'.join(entries) + '\n') - else: - sys.stdout.write(line) # write germline and somatic-multiallelic SNVs as is - - del ALT_basePositions - del REF_basePositions - del REF_baseQualities - del ALT_baseQualities - samfile.close() - - if args.altBQF is not None: - ALT_baseQualities_file.close() - - if args.refBQF is not None: - REF_baseQualities_file.close() - - if args.altPosF is not None: - ALT_basePositions_file.close() - - if args.refPosF is not None: - REF_basePositions_file.close() - - #vcfInFile.close() - #outFile.close() - -if __name__ == '__main__': - #print "Starting program...\n" - import argparse - parser = argparse.ArgumentParser() - #parser.add_option('--inf',action='store',type='string',dest='inf',help='Specify the name of the input vcf file containing all snvs (germline and somatic)',default='') - parser.add_argument('--alignmentFile',dest='alignmentFile',help='Specify the name of the BAM file containing bwa alignments, has to be the BAM file that was used to call the variants in the input vcf file - REQUIRED', required=True) - #parser.add_option('--outf',action='store',type='string',dest='outf',help='Specify the name of the output file, which will have same format as input vcf but with PE overlap filtered DP4 values if somatic and if snvs in PE overlap region',default='') - parser.add_argument('--referenceFile', dest='refFileName', help='Specify the full path of reference genome file in fasta format, with index in the same directory') - parser.add_argument('--mapq',type=int,dest='mapq',help='Specify the minimum mapping quality of bwa used for mpileup as parameter -q (default: 30 )',default=30) - parser.add_argument('--baseq',type=int,dest='baseq',help='Specify the minimum base quality scores used for mpileup as parameter -Q (default: 13)',default=13) - parser.add_argument('--qualityScore',dest='qualityScore',help='Specify whether the per base quality score is given in phred or illumina format (default is Illumina score: ASCII offset of 64, while PHRED scores have an ASCII offset of 33)',default='phred') - parser.add_argument('--maxNumberOfMismatchesInRead',type=int,dest='allowedNumberOfMismatches',help='Specify the number of mismatches that are allowed per read in order to consider this read. 
Value of -1 (default) turns this filter off.',default=-1) - parser.add_argument('--altBaseQualFile',nargs="?",type=argparse.FileType('w'),dest='altBQF',help='Specify the name of the output file for alternative allele base qualities.',default=None) - parser.add_argument('--refBaseQualFile',nargs="?",type=argparse.FileType('w'),dest='refBQF',help='Specify the name of the output file for reference allele base qualities.',default=None) - parser.add_argument('--altBasePositionsFile',nargs="?",type=argparse.FileType('w'),dest='altPosF',help='Specify the name of the output file for position within the reads for alternative bases.',default=None) - parser.add_argument('--refBasePositionsFile',nargs="?",type=argparse.FileType('w'),dest='refPosF',help='Specify the name of the output file for position within the reads for reference bases.',default=None) - parser.add_argument('--nocontrol',action='store_true',dest='no_control',help='Specify the workflow is run without control.',default=False) - parser.add_argument('--configFile',nargs="?",type=argparse.FileType('r'),dest='configfile',help='Specify the config file which contains header names.',default=None) - - - args = parser.parse_args() - performAnalysis(args) - #print "\nProgram successfully terminating...." - +#!/usr/bin/env python +# +# Copyright (c) 2018 German Cancer Research Center (DKFZ). +# +# Distributed under the MIT License (https://opensource.org/licenses/MIT). +# + + +# python /home/jaegern/pyWorkspace/NGS_Read_Processing/src/filter_PEoverlap.py --inf=snvs_108031.vcf --alignmentFile=/icgc/lsdf/mb/analysis/medullo/adultMB/results_per_pid/108031/alignment/tumor_108031_merged.bam.rmdup.bam --outf=snvs_108031_PEoverlapFiltered.vcf +# more snvs_108031.vcf | python /home/jaegern/pyWorkspace/NGS_Read_Processing/src/filter_PEoverlap.py --alignmentFile=/icgc/lsdf/mb/analysis/medullo/adultMB/results_per_pid/108031/alignment/tumor_108031_merged.bam.rmdup.bam --outf=snvs_108031_PEoverlapFiltered_nonALT_FINAL.vcf + + +import platform +if platform.python_implementation() == "PyPy": + import copysam as pysam +else: # "CPython" + import pysam + +import sys, os +from vcfparser import * +import re + +def listToTabsep(listItems, sep='\t'): + return sep.join(listItems) + +def qualFromASCII(ch): + return(ord(ch) - qualScoreOffset) + + +def transformQualStr(s): + return map(qualFromASCII,s) + +# Routine for getting index of ACGTNacgtn lists +def getIndexACGTNacgtn(is_reverse, is_read1, base): + if (is_reverse): + if(is_read1): + if(base == "a"): + return ["minus", 5] + elif(base == "c"): + return ["minus", 6] + elif(base == "g"): + return ["minus", 7] + elif(base == "t"): + return ["minus", 8] + elif(base == "n"): + return ["minus", 9] + else: + if(base == "a"): + return ["plus", 5] + elif(base == "c"): + return ["plus", 6] + elif(base == "g"): + return ["plus", 7] + elif(base == "t"): + return ["plus", 8] + elif(base == "n"): + return ["plus", 9] + else: + if(is_read1): + if(base == "a"): + return ["plus", 0] + elif(base == "c"): + return ["plus", 1] + elif(base == "g"): + return ["plus", 2] + elif(base == "t"): + return ["plus", 3] + elif(base == "n"): + return ["plus", 4] + else: + if(base == "a"): + return ["minus", 0] + elif(base == "c"): + return ["minus", 1] + elif(base == "g"): + return ["minus", 2] + elif(base == "t"): + return ["minus", 3] + elif(base == "n"): + return ["minus", 4] + +def decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar): + if remove_base.lower() == REF.lower(): + if remove_is_reverse: + # check if 
none of the 4 DP4 values are < 0 now, which can happen due to BAQ values instead of original base qualities, which are not part of the BAM file
+            if DP4rr > 0: DP4rr -= 1
+        else:
+            if DP4rf > 0: DP4rf -= 1
+    elif remove_base.lower() == ALT.lower():
+        if remove_is_reverse:
+            if DP4ar > 0: DP4ar -= 1
+        else:
+            if DP4af > 0: DP4af -= 1
+    return(DP4rf, DP4rr, DP4af, DP4ar)
+
+class BoolCounter:
+    """ A class for counter objects """
+
+    def __init__(self, ref_base, alt_base):
+        self.ref_base = ref_base
+        self.alt_base = alt_base
+        self.ref_count = 0
+        self.alt_count = 0
+
+    def update(self, current_base):
+        if self.ref_base == current_base:
+            self.ref_count += 1
+        elif self.alt_base == current_base:
+            self.alt_count += 1
+
+    def update_nonREFnonALT(self, count):
+        self.alt_count += count
+
+    def counted(self):
+        return [self.ref_count, self.alt_count]
+
+# MAIN ANALYSIS PROCEDURE
+def performAnalysis(args):
+    global qualScoreOffset
+
+    # http://stackoverflow.com/questions/881696/unbuffered-stdout-in-python-as-in-python-u-from-within-the-program
+    unbuffered = os.fdopen(sys.stdout.fileno(), 'w', 0)
+    sys.stdout = unbuffered
+
+    if args.qualityScore == 'illumina': qualScoreOffset = 64
+    elif args.qualityScore == 'phred': qualScoreOffset = 33
+
+    #vcfInFile = open(args.inf, "r")
+    #outFile = open(args.outf, "w")
+
+    # Reference file for CRAM files
+    reference_file = args.refFileName
+
+    mode = "r"
+    multiple_iterators = False
+    if args.alignmentFile.split(".")[-1] == "bam":
+        mode += "b"
+        samfile = pysam.Samfile(args.alignmentFile, mode)
+    elif args.alignmentFile.split(".")[-1] == "cram":
+        mode += "c"
+        samfile = pysam.Samfile(args.alignmentFile, mode, reference_filename = reference_file)
+    else:
+        raise ValueError("Unknown file alignment suffix '%s'. Need 'bam' or 'cram'" % args.alignmentFile.split(".")[-1])
+
+    if args.altPosF != '':
+        ALT_basePositions_file = args.altPosF
+
+    if args.refPosF != '':
+        REF_basePositions_file = args.refPosF
+
+    if args.altBQF != '':
+        ALT_baseQualities_file = args.altBQF
+
+    if args.refBQF != '':
+        REF_baseQualities_file = args.refBQF
+
+
+    for line in sys.stdin: # vcfInFile
+        if line[0] == "#":
+            headers = list(line[1:].rstrip().split('\t'))
+            fixed_headers = ["^CHROM$", "^POS$", "^REF$", "^ALT$", "^INFO$", ]
+            if args.no_control:
+                fixed_headers += ["^CONFIDENCE$", "^RECLASSIFICATION$", ]
+            else:
+                fixed_headers += ["^ANNOTATION_control", ]
+            header_indices = get_header_indices(headers, args.configfile, fixed_headers)
+            sys.stdout.write(line)
+            continue # skip header from analysis
+
+        entries = line.strip().split('\t')
+        parsed_line = LineParser(entries, header_indices)
+
+        nonREFnonALTfwd=0
+        nonREFnonALTrev=0
+        ALTcount=0
+
+        ALT_basePositions=[]
+        REF_basePositions=[]
+        REF_baseQualities=[]
+        ALT_baseQualities=[]
+
+        # how to treat multiallelic SNVs? Skipped in this current version...
+        if ((args.no_control and int(parsed_line["CONFIDENCE"]) > 7 and "somatic" in parsed_line["RECLASSIFICATION"]) or (not args.no_control and "somatic" in parsed_line["ANNOTATION_control"])) and len(parsed_line["ALT"]) == 1:
+            # DP=13;AF1=0.5;AC1=1;DP4=2,3,3,4;MQ=37;FQ=75;PV4=1,1,1,1
+            info_values = parsed_line["INFO"].split(';')
+            for info_idx, info_value in enumerate(info_values):
+                if info_value[:4] == "DP4=":
+                    DP4_idx = info_idx
+                    DP4 = list(map(int, info_value[4:].split(',')))
+                    DP4rf, DP4rr, DP4af, DP4ar = DP4
+                    DP4_original = re.sub('DP4', 'DP4original', info_value)  # keep a backup of the original DP4
+                    DP4_original_alt = DP4af + DP4ar
+                    break
+
+            chrom = parsed_line["CHROM"]
+            pos = int(parsed_line["POS"])
+            REF = parsed_line["REF"]
+            ALT = parsed_line["ALT"]
+
+            readNameHash = {}
+            readMateHash = {}  # hash storing read and mate start positions for duplicate marking
+            readMateHash_qnameLocation = {}  # hash storing the location of the qname in the above hash's lists
+
+            ACGTNacgtn1 = [0] * 10
+            ACGTNacgtn2 = [0] * 10
+            count_PE = BoolCounter(REF, ALT)  # counter for the forward and reverse reads removed due to PE overlap detection
+            count_supple = BoolCounter(REF, ALT)  # same, for supplementary reads; since flag_filter is set, the whole supplementary detection can be removed in future versions
+            count_mismatch = BoolCounter(REF, ALT)  # same, for the mismatch report
+            count_nonREFnonALT = BoolCounter(REF, ALT)  # same, for non-REF, non-ALT bases
+
+            # To match pysam and mpileup counts, a reference file is added. Given the reference file, pysam by default computes BAQ (compute_baq).
+            if chrom.endswith('alt') or chrom.startswith('HLA'):
+                flag_filter_value = 1796
+            else:
+                flag_filter_value = 3844
+            for pileupcolumn in samfile.pileup(chrom, (pos-1), pos, flag_filter=flag_filter_value, redo_baq=True, ignore_overlaps=False, multiple_iterators=multiple_iterators):
+                if pileupcolumn.pos == (pos-1):
+                    #print 'coverage at base %s = %s' % (pileupcolumn.pos, pileupcolumn.nsegments)
+                    for pileupread in pileupcolumn.pileups:
+                        if pileupread.is_del:
+                            # 31 May 2016 JB: deletion at the pileup position
+                            continue
+                        baseScore = transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0]
+                        readpos = pileupread.query_position
+                        if pileupread.alignment.seq[pileupread.query_position].lower() == ALT.lower():
+                            if args.altBQF != '':
+                                ALT_baseQualities.append(baseScore)
+                            if args.altPosF != '':
+                                if pileupread.alignment.is_reverse:
+                                    readlength = len(pileupread.alignment.seq)
+                                    readpos = (readlength - readpos)
+                                ALT_basePositions.append(readpos)
+                        if pileupread.alignment.seq[pileupread.query_position].lower() == REF.lower():
+                            if args.refBQF != '':
+                                REF_baseQualities.append(baseScore)
+                            if args.refPosF != '':
+                                if pileupread.alignment.is_reverse:
+                                    readlength = len(pileupread.alignment.seq)
+                                    readpos = (readlength - readpos)
+                                REF_basePositions.append(readpos)
+
+                        if pileupread.alignment.mapq >= args.mapq:
+                            # http://wwwfgu.anat.ox.ac.uk/~andreas/documentation/samtools/api.html USE qqual
+                            try:
+                                if transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] >= args.baseq:
+                                    # check whether we consider this read as a proper read in terms of the number of mismatches
+                                    if args.allowedNumberOfMismatches > -1:
+                                        numberOfMismatches = None
+                                        for tag in pileupread.alignment.tags:
+                                            if tag[0] == "NM":
+                                                numberOfMismatches = tag[1]
+                                                break
+                                        else:
+                                            continue
+
+                                        if numberOfMismatches is not None:
+                                            if numberOfMismatches > args.allowedNumberOfMismatches:
+                                                remove_base = pileupread.alignment.seq[pileupread.query_position]
+                                                remove_is_reverse = pileupread.alignment.is_reverse
+                                                count_mismatch.update(remove_base)
+                                                (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar)
+                                                # After decreasing the respective DP4 value, go directly to the next read
+                                                # without remembering the current read.
+                                                # This will lead to an unknown read name when the paired read occurs at the same
+                                                # position. As we have already discarded the current high-mismatch read, we do not
+                                                # have to decrease DP4 values again when the read partner occurs at the same SNV.
+                                                # We also do not increase ACGTNacgtn for the discarded read.
+                                                continue
+
+                                    # Check whether pileupread.alignment is a proper pair
+                                    if(pileupread.alignment.is_proper_pair):
+                                        # count to ACGTNacgtn list
+                                        is_reverse = pileupread.alignment.is_reverse
+                                        is_read1 = pileupread.alignment.is_read1
+                                        base = pileupread.alignment.seq[pileupread.query_position].lower()
+                                        ACGTNacgtn_index = getIndexACGTNacgtn(is_reverse, is_read1, base)
+                                        if(ACGTNacgtn_index[0] == "plus"):
+                                            ACGTNacgtn1[ACGTNacgtn_index[1]] += 1
+                                        else:
+                                            ACGTNacgtn2[ACGTNacgtn_index[1]] += 1
+
+                                    #if transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] >= args.baseq: # DEBUG July 23 2012: BROAD BAM problem due to pileupread.alignment.qqual being shorter sometimes than pileupread.alignment.qual
+                                    if(pileupread.alignment.query_name in readNameHash):
+                                        old_qual = readNameHash[pileupread.alignment.query_name][0]
+                                        old_base = readNameHash[pileupread.alignment.query_name][1]
+                                        old_is_reverse = readNameHash[pileupread.alignment.query_name][2]
+                                        old_read_mate_tuple = readNameHash[pileupread.alignment.query_name][3]
+                                        current_qual = transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0]
+                                        current_base = pileupread.alignment.seq[pileupread.query_position]
+                                        current_is_reverse = pileupread.alignment.is_reverse
+                                        current_read_mate_tuple = (pileupread.alignment.reference_id, pileupread.alignment.reference_start, pileupread.alignment.reference_end, pileupread.alignment.next_reference_id, pileupread.alignment.next_reference_start)
+                                        # If a read name occurs twice for one variant, it is due to overlapping PE reads; subtract the variant count from the DP4 field.
+                                        # If old_base is not equal to the new base, remove the one with the smaller base quality.
+                                        remove_base = None
+                                        remove_is_reverse = None
+                                        if(not(old_base == current_base)):
+                                            if(old_qual <= current_qual):
+                                                remove_base = old_base
+                                                remove_is_reverse = old_is_reverse
+                                                remove_old = True
+                                            else:
+                                                remove_base = current_base
+                                                remove_is_reverse = current_is_reverse
+                                                remove_old = False
+                                        else:
+                                            remove_base = current_base
+                                            remove_is_reverse = current_is_reverse
+                                            remove_old = False
+
+                                        count_PE.update(remove_base)
+                                        (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar)
+
+                                        # If the current base is better, remove the information about the old mate.
+                                        # If the current base is not better, do nothing.
+                                        if remove_old:
+                                            old_location = readMateHash_qnameLocation[pileupread.alignment.query_name]
+                                            del readMateHash[old_read_mate_tuple][old_location]
+                                            read_mate_tuple_value = (current_qual, (current_base, current_is_reverse, pileupread.alignment.query_name, pileupread.alignment.is_supplementary))
+                                            if current_read_mate_tuple in readMateHash:
+                                                readMateHash[current_read_mate_tuple].append(read_mate_tuple_value)
+                                            else:
+                                                readMateHash[current_read_mate_tuple] = []
+                                                readMateHash[current_read_mate_tuple].append(read_mate_tuple_value)
+
+                                    else:
+                                        # Store base quality, base, and read direction in readNameHash
+                                        base_qual_score = transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0]
+                                        read_mate_tuple = (pileupread.alignment.reference_id, pileupread.alignment.reference_start, pileupread.alignment.reference_end, pileupread.alignment.next_reference_id, pileupread.alignment.next_reference_start)
+                                        read_mate_tuple_value = (base_qual_score, (pileupread.alignment.seq[pileupread.query_position], pileupread.alignment.is_reverse, pileupread.alignment.query_name, pileupread.alignment.is_supplementary))
+
+                                        readNameHash[pileupread.alignment.query_name] = [base_qual_score, pileupread.alignment.seq[pileupread.query_position], pileupread.alignment.is_reverse, read_mate_tuple]
+
+                                        if read_mate_tuple in readMateHash:
+                                            readMateHash[read_mate_tuple].append(read_mate_tuple_value)
+                                        else:
+                                            readMateHash[read_mate_tuple] = []
+                                            readMateHash[read_mate_tuple].append(read_mate_tuple_value)
+
+                                        readMateHash_qnameLocation[pileupread.alignment.query_name] = len(readMateHash[read_mate_tuple]) - 1  # location of the last pushed element in the list
+
+                            except IndexError:
+                                pass  # soft-clipped or trimmed base, not part of the high-quality alignment anyway; skip
+
+                            if transformQualStr(pileupread.alignment.qual[pileupread.query_position])[0] >= args.baseq:
+
+                                if pileupread.alignment.seq[pileupread.query_position] == ALT:
+                                    ALTcount += 1
+                                # samtools mpileup sometimes counts bases as variants which are neither REF nor ALT
+                                if (pileupread.alignment.seq[pileupread.query_position] != REF) and (pileupread.alignment.seq[pileupread.query_position] != ALT):
+                                    if pileupread.alignment.is_reverse:
+                                        nonREFnonALTrev += 1
+                                        #if DP4ar > 0: DP4ar -= 1
+                                    else:
+                                        nonREFnonALTfwd += 1
+                                        #if DP4af > 0: DP4af -= 1
+
+                    if (len(REF_baseQualities) > 0):
+                        VAF = 1.0 * len(ALT_baseQualities) / (len(REF_baseQualities) + len(ALT_baseQualities))
+                    elif (len(ALT_baseQualities) > 0):
+                        VAF = 1.0
+                    else:
+                        VAF = 0.0
+
+                    if args.altBQF != '':
+                        scoreString = ",".join([str(score) for score in ALT_baseQualities])
+                        if scoreString != '':
+                            scoreString = ";".join([scoreString, str(VAF)])
+                            ALT_baseQualities_file.write("%s\t%s\t%s\n" % (chrom, pos, scoreString))
+
+                    if args.refBQF != '':
+                        scoreString = ",".join([str(score) for score in REF_baseQualities])
+                        if scoreString != '':
+                            scoreString = ";".join([scoreString, str(VAF)])
+                            REF_baseQualities_file.write("%s\t%s\t%s\n" % (chrom, pos, scoreString))
+
+                    if args.altPosF != '':
+                        positionsString = ",".join([str(readpos) for readpos in ALT_basePositions])
+                        if positionsString != '':
+                            ALT_basePositions_file.write("%s\t%s\t%s\n" % (chrom, pos, positionsString))
+
+                    if args.refPosF != '':
+                        positionsString = ",".join([str(readpos) for readpos in REF_basePositions])
+                        if positionsString != '':
+                            REF_basePositions_file.write("%s\t%s\t%s\n" % (chrom, pos, positionsString))
+
+                    break  # only one pileup for a position
+
+            # Calculate duplicates based on the read-mate pair's start positions (chromosome id and start location)
+            count_duplicate = BoolCounter(REF, ALT)
+            for key in readMateHash:
+                value_length = len(readMateHash[key])
+                if value_length > 0:
+                    sorted_values = sorted(readMateHash[key], key=lambda x: x[0])  # sorted by base quality
+                    sorted_values = sorted_values[:-1]  # drop the read with the highest quality, so it is retained for the count
+                    for value in sorted_values:  # remove everything else
+                        qual_value, decreaseInfo = value
+                        remove_base = decreaseInfo[0]
+                        remove_is_reverse = decreaseInfo[1]
+
+                        count_duplicate.update(remove_base)
+                        (DP4rf, DP4rr, DP4af, DP4ar) = decreaseDP4(remove_base, remove_is_reverse, REF, ALT, DP4rf, DP4rr, DP4af, DP4ar)
+
+            if (DP4[2] + DP4[3]) > ALTcount:  # DP4 ALT counts larger than ALTcount happen often due to BAQ during samtools mpileup, which doesn't change the base quality in the BAM file but lowers it during calling
+                if DP4af >= nonREFnonALTfwd: DP4af -= nonREFnonALTfwd
+
+                if DP4ar >= nonREFnonALTrev: DP4ar -= nonREFnonALTrev
+
+            nonREFnonALT = nonREFnonALTrev + nonREFnonALTfwd
+            count_nonREFnonALT.update_nonREFnonALT(nonREFnonALT)
+
+            # Format: DP2 -> "reference(forward + reverse), alt(forward + reverse)"
+            supple_dup_str = 'DP2sup=' + ','.join(map(str, count_supple.counted()))
+            supple_dup_str += ';DP2dup=' + ','.join(map(str, count_duplicate.counted()))
+            supple_dup_str += ';DP2pairEnd=' + ','.join(map(str, count_PE.counted()))
+            supple_dup_str += ';DP2mis=' + ','.join(map(str, count_mismatch.counted()))
+            supple_dup_str += ';DP2nonREFnonALT=' + ','.join(map(str, count_nonREFnonALT.counted()))
+
+            ACGTNacgtn1_string = "ACGTNacgtnPLUS=" + ",".join([str(i) for i in ACGTNacgtn1])
+            ACGTNacgtn2_string = "ACGTNacgtnMINUS=" + ",".join([str(i) for i in ACGTNacgtn2])
+
+            info_values[DP4_idx] = "DP4=" + str(DP4rf) + "," + str(DP4rr) + "," + str(DP4af) + "," + str(DP4ar)
+            info_values.append(ACGTNacgtn1_string)
+            info_values.append(ACGTNacgtn2_string)
+            info_values.append(DP4_original)
+            info_values.append(supple_dup_str)
+
+            entries[header_indices["INFO"]] = ';'.join(info_values)
+
+            sys.stdout.write('\t'.join(entries) + '\n')
+        else:
+            sys.stdout.write(line)  # write germline and somatic-multiallelic SNVs as is
+
+    del ALT_basePositions
+    del REF_basePositions
+    del REF_baseQualities
+    del ALT_baseQualities
+    samfile.close()
+
+    if args.altBQF is not None:
+        ALT_baseQualities_file.close()
+
+    if args.refBQF is not None:
+        REF_baseQualities_file.close()
+
+    if args.altPosF is not None:
+        ALT_basePositions_file.close()
+
+    if args.refPosF is not None:
+        REF_basePositions_file.close()
+
+    #vcfInFile.close()
+    #outFile.close()
+
+if __name__ == '__main__':
+    #print "Starting program...\n"
+    import argparse
+    parser = argparse.ArgumentParser()
+    #parser.add_option('--inf',action='store',type='string',dest='inf',help='Specify the name of the input vcf file containing all snvs (germline and somatic)',default='')
+    parser.add_argument('--alignmentFile',dest='alignmentFile',help='Specify the name of the BAM file containing bwa alignments; this has to be the BAM file that was used to call the variants in the input vcf file - REQUIRED', required=True)
+    #parser.add_option('--outf',action='store',type='string',dest='outf',help='Specify the name of the output file, which will have the same format as the input vcf but with PE-overlap-filtered DP4 values if somatic and if snvs are in the PE overlap region',default='')
+    parser.add_argument('--referenceFile', dest='refFileName', help='Specify the full path of the reference genome file in fasta format, with the index in the same directory')
+    parser.add_argument('--mapq',type=int,dest='mapq',help='Specify the minimum mapping quality of bwa used for mpileup as parameter -q (default: 30)',default=30)
+    parser.add_argument('--baseq',type=int,dest='baseq',help='Specify the minimum base quality scores used for mpileup as parameter -Q (default: 13)',default=13)
+    parser.add_argument('--qualityScore',dest='qualityScore',help='Specify whether the per-base quality score is given in phred or illumina format (illumina scores have an ASCII offset of 64, PHRED scores an ASCII offset of 33; default: phred)',default='phred')
+    parser.add_argument('--maxNumberOfMismatchesInRead',type=int,dest='allowedNumberOfMismatches',help='Specify the number of mismatches that are allowed per read in order to consider this read. A value of -1 (default) turns this filter off.',default=-1)
+    parser.add_argument('--altBaseQualFile',nargs="?",type=argparse.FileType('w'),dest='altBQF',help='Specify the name of the output file for alternative allele base qualities.',default=None)
+    parser.add_argument('--refBaseQualFile',nargs="?",type=argparse.FileType('w'),dest='refBQF',help='Specify the name of the output file for reference allele base qualities.',default=None)
+    parser.add_argument('--altBasePositionsFile',nargs="?",type=argparse.FileType('w'),dest='altPosF',help='Specify the name of the output file for positions within the reads for alternative bases.',default=None)
+    parser.add_argument('--refBasePositionsFile',nargs="?",type=argparse.FileType('w'),dest='refPosF',help='Specify the name of the output file for positions within the reads for reference bases.',default=None)
+    parser.add_argument('--nocontrol',action='store_true',dest='no_control',help='Specify that the workflow is run without a control.',default=False)
+    parser.add_argument('--configFile',nargs="?",type=argparse.FileType('r'),dest='configfile',help='Specify the config file which contains the header names.',default=None)
+
+    args = parser.parse_args()
+    performAnalysis(args)
+    #print "\nProgram successfully terminating...."
+
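The block above is the heart of the read-level DP4 correction: overlapping mate pairs, duplicates, high-mismatch reads and non-REF/non-ALT bases are counted and subtracted from the DP4 field, with the original values preserved as `DP4original`. As a minimal stand-alone sketch of how this step is driven (the script reads an annotated VCF on stdin and writes the corrected VCF to stdout; `filterPeOverlap` is the tool alias used in `snvAnnotation.sh` below, and all file paths here are placeholders):

```bash
# Hedged sketch: stand-alone run of the PE-overlap/DP4 correction step.
# Flags are taken from the argparse definition above; paths are placeholders.
cat snvs_PID_annotated.vcf \
    | filterPeOverlap \
        --alignmentFile=/path/to/tumor.bam \
        --referenceFile=/reference/data/hs37d5_PhiX.fa \
        --mapq=30 --baseq=13 --qualityScore=phred \
        --maxNumberOfMismatchesInRead=-1 \
        --altBaseQualFile=alt_base_qualities.txt \
        --refBaseQualFile=ref_base_qualities.txt \
        --altBasePositionsFile=alt_read_positions.txt \
        --refBasePositionsFile=ref_read_positions.txt \
    > snvs_PID_dp4_corrected.vcf
```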
diff --git a/resources/analysisTools/snvPipeline/filter_vcf.sh b/resources/analysisTools/snvPipeline/filter_vcf.sh
index 0e8f226..fa23694 100755
--- a/resources/analysisTools/snvPipeline/filter_vcf.sh
+++ b/resources/analysisTools/snvPipeline/filter_vcf.sh
@@ -24,7 +24,7 @@ getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFI
 ALIGNMENT_FOLDER=`dirname ${TUMOR_BAMFILE_FULLPATH_BP}`
 
 # bugfix: ensure to interpret CHROMOSOME_INDICES as array - otherwise TOOL_INTERMUTATION_DISTANCE_COORD_COLOR will fail...
-declare -a CHROMOSOME_INDICES="${CHROMOSOME_INDICES}"
+declare -a CHROMOSOME_INDICES="${CHROMOSOME_INDICES_PLOTTING:-${CHROMOSOME_INDICES}}"
 numberOfChromosomes=${CHROMOSOME_INDICES[@]}
 outputFilenamePrefix=${mpileupDirectory}/${SNVFILE_PREFIX}${PID}
 outputFilenamePrefix_original=${outputFilenamePrefix}
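The `${CHROMOSOME_INDICES_PLOTTING:-${CHROMOSOME_INDICES}}` change lets a project define a separate, restricted chromosome set for plotting (presumably so that GRCh38 pseudo-indices such as ALT and HLA can be excluded from per-chromosome plots) while the calling set stays untouched. A minimal sketch of the parameter-expansion semantics:

```bash
# ${VAR:-fallback} expands to VAR if it is set and non-empty, otherwise to the fallback.
unset CHROMOSOME_INDICES_PLOTTING
CHROMOSOME_INDICES="( 1 2 X Y ALT HLA )"
echo "${CHROMOSOME_INDICES_PLOTTING:-${CHROMOSOME_INDICES}}"   # full calling set
CHROMOSOME_INDICES_PLOTTING="( 1 2 X Y )"
echo "${CHROMOSOME_INDICES_PLOTTING:-${CHROMOSOME_INDICES}}"   # restricted plotting set
```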
diff --git a/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl b/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl
index cbf0ef3..c97155b 100755
--- a/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl
+++ b/resources/analysisTools/snvPipeline/in_dbSNPcounter.pl
@@ -23,7 +23,7 @@
 {
 	$minscore = 8;
 }
-open (FH, $file) or die "Could not open $file: $!\n";
+open (my $FH, $file) or die "Could not open $file: $!\n";
 
 # count all, somatic, s. with minconfidence, in dbSNP, in 1KG
 my $all = 0;
@@ -37,85 +37,80 @@
 my $header = "";
-while ($header = <FH>)
-{
-	last if ($header =~ /^\#CHROM/); # that is the line with the column names
-}
-chomp $header;
-
-my $DBSBP = 13;
-my $KG = 14;
-my $CONFIDENCE = 28;
-my $CANNOTATION = -1;
-my $RECLASSIFICATION = -1;
-my $CLASSIFICATION;
+my ($DBSBP, $KG, $CONFIDENCE, $CLASSIFICATION);
 
-@help = split(/\t/, $header);
-for (my $c = 0; $c < @help; $c++)
-{
-	# fixed column labels (from vcf_pileup_compare)
-	if ($help[$c] eq "CONFIDENCE")
-	{
-		$CONFIDENCE = $c;
-		print STDERR "CONFIDENCE in column $c\n";
-	}
-	if ($help[$c] eq "1K_GENOMES")
-	{
-		$KG = $c;
-		print STDERR "INFO in column $c\n";
-	}
-	if ($help[$c] eq "DBSNP")
-	{
-		$DBSBP = $c;
-		print STDERR "DBSNP_COL in column $c\n";
-	}
-	if ($help[$c] eq "ANNOTATION_control")
-	{
-		$CANNOTATION = $c;
-		print STDERR "ANNOTATION_control in column $c\n";
-	}
-	if ($help[$c] eq "RECLASSIFICATION")
-	{
-		$RECLASSIFICATION = $c;
-		print STDERR "RECLASSIFICATION in column $c\n";
-	}
-}
-
-if ($CANNOTATION > 0) {
-	$CLASSIFICATION = $CANNOTATION;
-} else {
-	$CLASSIFICATION = $RECLASSIFICATION;
-}
-
-# I know that it's evilly hardcoded!
-while (<FH>)
-{
-	if ($_ =~ /^#/) # skip header lines
-	{
-		next;
-	}
-	$all++;
-	@help = split ("\t", $_);
-	if ($help[$CLASSIFICATION] =~ /somatic/)
-	{
-		$somatic++;
-		if ($help[$CONFIDENCE] >= $minscore)
-		{
-			$scored++;
-			#print STDERR $_;
-			if ($help[$DBSBP] =~ /MATCH=exact/)
-			{
-				$dbSNP++;
-			}
-			if ($help[$KG] =~ /MATCH=exact/)
-			{
-				$oneKG++;
-			}
-		}
-	}
-}
+while(!eof($FH)){
+	my $line = readline($FH) || die "Could not read line: $!";
+	chomp $line;
+	if ($line =~ /^#/)
+	{
+		if ($line =~/^#CHROM/){
+			my @head=split("\t", $line);
+			my %col;
+
+			my $i = 0;
+			while($i < @head)
+			{
+				if($head[$i] eq "DBSNP"){
+					$col{"DBSNP"} = $i;
+					print STDERR "DBSNP in column ", $i+1,"\n";
+				}
+				if($head[$i] eq "1K_GENOMES"){
+					$col{"1K_GENOMES"} = $i;
+					print STDERR "1K_GENOMES in column ", $i+1,"\n";
+				}
+				if($head[$i] eq "RECLASSIFICATION"){
+					$col{"RECLASSIFICATION"} = $i;
+					print STDERR "RECLASSIFICATION in column ", $i+1,"\n";
+				}
+				if($head[$i] eq "ANNOTATION_control"){
+					$col{"ANNOTATION_control"} = $i;
+					print STDERR "ANNOTATION_control in column ", $i+1,"\n";
+				}
+				if($head[$i] eq "CONFIDENCE"){
+					$col{"CONFIDENCE"} = $i;
+					print STDERR "CONFIDENCE in column ", $i+1,"\n";
+				}
+				$i++;
+			}
+
+			$DBSBP = $col{"DBSNP"};
+			$KG = $col{"1K_GENOMES"};
+			$CONFIDENCE = $col{"CONFIDENCE"};
+
+			if (defined $col{"ANNOTATION_control"}) {
+				$CLASSIFICATION = $col{"ANNOTATION_control"};
+			} else {
+				$CLASSIFICATION = $col{"RECLASSIFICATION"};
+			}
+		}
+	}
+	else{
+		$all++;
+		@help = split ("\t", $line);
+		if ($help[$CLASSIFICATION] =~ /somatic/)
+		{
+			$somatic++;
+			if ($help[$CONFIDENCE] >= $minscore)
+			{
+				$scored++;
+				if ($help[$DBSBP] =~ /MATCH=exact/)
+				{
+					$dbSNP++;
+				}
+				if ($help[$KG] =~ /MATCH=exact/)
+				{
+					$oneKG++;
+				}
+			}
+		}
+	}
+}
+close $FH;
 
-close FH;
 if ($scored > 0)
 {
 	$perc_dbSNP = sprintf ("%.2f", ($dbSNP/$scored*100));
@@ -126,6 +121,5 @@ else # there might be no ...
 {
 	die "!!! There are no SNVs with confidence >= $minscore in the input, it makes no sense to do further analyses with the current settings on these data!!!\n";
-
 }
 exit;
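The rewritten counter no longer hardcodes column indices but locates `DBSNP`, `1K_GENOMES`, `CONFIDENCE`, `ANNOTATION_control` and `RECLASSIFICATION` in the `#CHROM` header line, which makes it robust against column reordering. The same lookup can be reproduced on the shell for debugging (the file name is a placeholder):

```bash
# Number the #CHROM header fields of a results VCF and find a column by name;
# the 1-based numbers printed here match the "in column N" messages of the script.
zcat snvs_PID_somatic.vcf.gz | grep -m1 '^#CHROM' | tr '\t' '\n' | cat -n | grep -w 'CONFIDENCE'
```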
diff --git a/resources/analysisTools/snvPipeline/intermutationDistance_Coord_color.r b/resources/analysisTools/snvPipeline/intermutationDistance_Coord_color.r
index e00938f..faa1757 100755
--- a/resources/analysisTools/snvPipeline/intermutationDistance_Coord_color.r
+++ b/resources/analysisTools/snvPipeline/intermutationDistance_Coord_color.r
@@ -45,6 +45,14 @@ c.array <- unlist(strsplit(opt$chrArray, ',', fixed = TRUE))
 data <- read.table(opt$chrLengthFile, header=FALSE)
 chrLength <- data.frame(data)
 rownames(chrLength) <- chrLength$V1
+
+if(opt$chrPrefix != "" &&
+   grepl(opt$chrPrefix, chrLength$V1[1]) &&
+   !grepl(opt$chrPrefix, c.array[1])){
+
+  c.array <- paste0(opt$chrPrefix, c.array, opt$chrSuffix)
+}
+
 xtotal <- sum(chrLength[c.array,2]/10)
 xoffsets <- c(0)
@@ -84,7 +92,7 @@ pdf(opt$outFilePrefix, width=20, height=8)
 plotColors = c("CA"="blue","CG"="black","CT"="red","TA"="purple","TC"="orange","TG"="green")
 baseChanges <- c("CA", "CG", "CT", "TA", "TC","TG")
 
-c.name <- paste0(opt$chrPrefix, c.array[1], opt$chrSuffix)
+c.name <- c.array[1]
 
 sel <- which(dat$diff > 0)
 if(length(sel)){
@@ -101,7 +109,7 @@ if(length(sel)){
 # chromosomes <- c(2:22, 'X')
 for (chr in c.array) {
-  c.name <- paste0(opt$chrPrefix, chr, opt$chrSuffix)
+  c.name <- chr
   sel <- which(dat$X.CHROM == c.name & dat$diff > 0)
   points((dat$POS[sel]/10+xoffset)/xtotal, log10(dat$diff[sel]), col = plotColors[dat$change[sel]], pch=16,cex=0.8)
diff --git a/resources/analysisTools/snvPipeline/snvAnnotation.sh b/resources/analysisTools/snvPipeline/snvAnnotation.sh
index f7d6d1a..94a4bbe 100755
--- a/resources/analysisTools/snvPipeline/snvAnnotation.sh
+++ b/resources/analysisTools/snvPipeline/snvAnnotation.sh
@@ -17,7 +17,7 @@ declare -r outputDirectory=`dirname ${FILENAME_VCF_OUT}`
 relinked=false
 # (Re-)link bam file and index. The pipeline cannot handle e.g. .mdup.bam and .mdup.bai.
 # Look up the index as .bam.bai. If the file exists, nothing happens.
-if [[ ! -f ${TUMOR_BAMFILE_FULLPATH_BP}.bai ]]
+if [[ ! -f ${TUMOR_BAMFILE_FULLPATH_BP}.bai && ! -f ${TUMOR_BAMFILE_FULLPATH_BP}.crai ]]
 then
     shortBamIdx=`dirname ${TUMOR_BAMFILE_FULLPATH_BP}`"/"`basename ${TUMOR_BAMFILE_FULLPATH_BP} .bam`".bai"
    [[ ! -f ${shortBamIdx} ]] && echo "Bam index file cannot be found!" && exit -15
@@ -91,12 +91,10 @@ fi
 cmdFilter="${cmdFilter} | ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${DBSNP} --columnName=${DBSNP_COL} --reportMatchType --bAdditionalColumn=2 --reportLevel 4 | \
     ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${KGENOME} --columnName=${KGENOMES_COL} --reportMatchType --bAdditionalColumn=2 --reportLevel 4 | \
-    ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${ExAC} --columnName ${ExAC_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \
-    ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${EVS} --columnName ${EVS_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \
-    ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${GNOMAD_WES_ALL_SNV} --columnName=${GNOMAD_WES_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \
-    ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${GNOMAD_WGS_ALL_SNV} --columnName=${GNOMAD_WGS_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \
+    ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${GNOMAD_WES_ALL_SNV} --columnName=${GNOMAD_WES_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \
+    ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${GNOMAD_WGS_ALL_SNV} --columnName=${GNOMAD_WGS_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \
     ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${LOCALCONTROL_WGS} --columnName ${LOCALCONTROL_WGS_COL} --bFileType vcf --reportMatchType --reportLevel 4 | \
-    ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${LOCALCONTROL_WES} --columnName ${LOCALCONTROL_WES_COL} --bFileType vcf --reportMatchType --reportLevel 4"
+    ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} --tabix_bin=${TABIX_BINARY} -a - -b ${LOCALCONTROL_WES} --columnName ${LOCALCONTROL_WES_COL} --bFileType vcf --reportMatchType --reportLevel 4 "
 
 if [[ -f ${RECURRENCE} ]]; then
     cmdFilter="${cmdFilter} | ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${RECURRENCE} --columnName ${RECURRENCE_COL} --bFileType vcf"
@@ -199,7 +197,8 @@ mkfifo ${npConfidence}
 if [[ ${isNoControlWorkflow-false} == "false" ]]; then
     cat ${filenameSNVVCF} > ${npConfidence} &
 else
-    cat ${filenameSNVVCF} | ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS} -a 0 > ${npConfidence} &
+    cat ${filenameSNVVCF} | ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS} -a 0 \
+        --gnomAD_WGS_maxMAF=${CRIT_GNOMAD_GENOMES_maxMAF} --gnomAD_WES_maxMAF=${CRIT_GNOMAD_EXOMES_maxMAF} --localControl_WGS_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --localControl_WES_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --1000genome_maxMAF=${CRIT_1KGENOMES_maxMAF} > ${npConfidence} &
 fi
 
 # create BaseScore FIFOs and their consumer processes (zip and write to target file)
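The hunk above (and the three analogous hunks that follow) threads five explicit population-frequency cutoffs into each confidence-annotation call instead of relying on defaults baked into the script. Since the same five flags are repeated at four call sites, they could be collected once; a sketch, assuming the `CRIT_*` names are the plugin's configuration values and using placeholder fallbacks:

```bash
# Hypothetical consolidation of the repeated MAF-cutoff flag group; the numeric
# fallbacks are illustrative placeholders, not the plugin's actual defaults.
MAXMAF_OPTS="--gnomAD_WGS_maxMAF=${CRIT_GNOMAD_GENOMES_maxMAF:-0.001} \
    --gnomAD_WES_maxMAF=${CRIT_GNOMAD_EXOMES_maxMAF:-0.001} \
    --localControl_WGS_maxMAF=${CRIT_LOCALCONTROL_maxMAF:-0.01} \
    --localControl_WES_maxMAF=${CRIT_LOCALCONTROL_maxMAF:-0.01} \
    --1000genome_maxMAF=${CRIT_1KGENOMES_maxMAF:-0.001}"
```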
@@ -214,7 +213,8 @@ cat ${filenameReferenceAlleleReadPositions}_NP | ${BGZIP_BINARY} -f >${filenameReferenceAlleleReadPositions} &
 if [[ ${runArtifactFilter-true} == true ]]
 then
     cat ${npConfidence} | filterPeOverlap ${noControlFlag} --alignmentFile=${tumorbamfullpath} --mapq=$mapqual --baseq=$basequal --qualityScore=phred --maxNumberOfMismatchesInRead=${NUMBER_OF_MISMACTHES_THRESHOLD--1} --altBaseQualFile=${filenameAlternativeAlleleBaseScores}_NP --refBaseQualFile=${filenameReferenceAlleleBaseScores}_NP --altBasePositionsFile=${filenameAlternativeAlleleReadPositions}_NP --refBasePositionsFile=${filenameReferenceAlleleReadPositions}_NP --referenceFile=${REFERENCE_GENOME} \
-        | ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS} -a 0 -f ${filenameSomaticSNVsTmp} > ${filenameSNVVCFTemp}.tmp
+        | ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS} -a 0 -f ${filenameSomaticSNVsTmp} \
+        --gnomAD_WGS_maxMAF=${CRIT_GNOMAD_GENOMES_maxMAF} --gnomAD_WES_maxMAF=${CRIT_GNOMAD_EXOMES_maxMAF} --localControl_WGS_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --localControl_WES_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --1000genome_maxMAF=${CRIT_1KGENOMES_maxMAF} > ${filenameSNVVCFTemp}.tmp
 
     [[ $? != 0 ]] && echo "Error in first iteration of confidence annotation" && exit 2
 
@@ -243,7 +243,8 @@ then
     fi
 
     cat ${filenameSNVVCFTemp} | ${PYTHON_BINARY} ${TOOL_FLAG_BIAS} --vcfFile="/dev/stdin" --referenceFile=${REFERENCE_GENOME} --sequence_specificFile=${filenamePCRerrorMatrix} --sequencing_specificFile=${filenameSequencingErrorMatrix} --numReads=${nReads} --numMuts=${nMuts} --biasPValThreshold=${biasPValThreshold} --biasRatioThreshold=${biasRatioThreshold} --biasRatioMinimum=${biasRatioMinimum} --maxNumOppositeReadsSequencingWeakBias=${maxNumOppositeReadsSequencingWeakBias} --maxNumOppositeReadsSequenceWeakBias=${maxNumOppositeReadsSequenceWeakBias} --maxNumOppositeReadsSequencingStrongBias=${maxNumOppositeReadsSequencingStrongBias} --maxNumOppositeReadsSequenceStrongBias=${maxNumOppositeReadsSequenceStrongBias} --ratioVcf=${rVcf} --bias_matrixSeqFile=${filenameBiasMatrixSeqFile} --bias_matrixSeqingFile=${filenameBiasMatrixSeqingFile} --vcfFileFlagged="/dev/stdout" | \
-        ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS} -a 1 -f ${filenameSomaticSNVsTmp} > ${filenameSNVVCFTemp}.tmp
+        ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS} -a 1 -f ${filenameSomaticSNVsTmp} \
+        --gnomAD_WGS_maxMAF=${CRIT_GNOMAD_GENOMES_maxMAF} --gnomAD_WES_maxMAF=${CRIT_GNOMAD_EXOMES_maxMAF} --localControl_WGS_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --localControl_WES_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --1000genome_maxMAF=${CRIT_1KGENOMES_maxMAF} > ${filenameSNVVCFTemp}.tmp
 
     [[ $? != 0 ]] && echo "Error in first filtering and/or second iteration of confidence annotation" && exit 5
 
@@ -267,7 +268,8 @@ then
     fi
 
     cat ${filenameSNVVCFTemp} | ${PYTHON_BINARY} -u ${TOOL_FLAG_BIAS} --vcfFile="/dev/stdin" --referenceFile=${REFERENCE_GENOME} --sequence_specificFile=${filenamePCRerrorMatrix} --sequencing_specificFile=${filenameSequencingErrorMatrix} --numReads=${nReads} --numMuts=${nMuts} --biasPValThreshold=${biasPValThreshold} --biasRatioThreshold=${biasRatioThreshold} --biasRatioMinimum=${biasRatioMinimum} --maxNumOppositeReadsSequencingWeakBias=${maxNumOppositeReadsSequencingWeakBias} --maxNumOppositeReadsSequenceWeakBias=${maxNumOppositeReadsSequenceWeakBias} --maxNumOppositeReadsSequencingStrongBias=${maxNumOppositeReadsSequencingStrongBias} --maxNumOppositeReadsSequenceStrongBias=${maxNumOppositeReadsSequenceStrongBias} --ratioVcf=${rVcf} --bias_matrixSeqFile=${filenameBiasMatrixSeqFile} --bias_matrixSeqingFile=${filenameBiasMatrixSeqingFile} --vcfFileFlagged="/dev/stdout" | \
-        ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS_PANCAN} -a 2 > ${filenameSNVVCF}
+        ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS_PANCAN} -a 2 \
+        --gnomAD_WGS_maxMAF=${CRIT_GNOMAD_GENOMES_maxMAF} --gnomAD_WES_maxMAF=${CRIT_GNOMAD_EXOMES_maxMAF} --localControl_WGS_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --localControl_WES_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --1000genome_maxMAF=${CRIT_1KGENOMES_maxMAF} > ${filenameSNVVCF}
 
     [[ $? != 0 ]] && echo "Error in second filtering and/or third iteration of confidence annotation" && exit 8
 
@@ -289,7 +291,8 @@ then
     fi
 else
     cat ${npConfidence} | filterPeOverlap ${noControlFlag} --alignmentFile=${tumorbamfullpath} --mapq=$mapqual --baseq=$basequal --qualityScore=phred --maxNumberOfMismatchesInRead=${NUMBER_OF_MISMACTHES_THRESHOLD--1} --altBaseQualFile=${filenameAlternativeAlleleBaseScores}_NP --refBaseQualFile=${filenameReferenceAlleleBaseScores}_NP --altBasePositionsFile=${filenameAlternativeAlleleReadPositions}_NP --refBasePositionsFile=${filenameReferenceAlleleReadPositions}_NP --referenceFile=${REFERENCE_GENOME} | \
-        ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS_PANCAN} > ${filenameSNVVCFTemp}
+        ${PYPY_OR_PYTHON_BINARY} -u ${TOOL_CONFIDENCE_ANNOTATION} ${noControlFlag} -i - ${CONFIDENCE_OPTS_PANCAN} \
+        --gnomAD_WGS_maxMAF=${CRIT_GNOMAD_GENOMES_maxMAF} --gnomAD_WES_maxMAF=${CRIT_GNOMAD_EXOMES_maxMAF} --localControl_WGS_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --localControl_WES_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --1000genome_maxMAF=${CRIT_1KGENOMES_maxMAF} > ${filenameSNVVCFTemp}
 
     exitCode=$?
     [[ $exitCode == 0 ]] && [[ -f ${filenameSNVVCFTemp} ]] && mv ${filenameSNVVCFTemp} ${filenameSNVVCF}
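With CRAM support, snvAnnotation.sh now accepts either a `.bai` or a `.crai` index next to the alignment file before falling back to the `.bam`-stripped index name (see the first hunk of this file). The relaxed test reduces to:

```bash
# Minimal sketch of the relaxed index check; message and exit code are placeholders.
alignment=/path/to/tumor.bam            # works the same for /path/to/tumor.cram
if [[ ! -f "${alignment}.bai" && ! -f "${alignment}.crai" ]]; then
    echo "No .bai or .crai index found for ${alignment}" >&2
    exit 15
fi
```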
diff --git a/resources/analysisTools/snvPipeline/snvCalling.sh b/resources/analysisTools/snvPipeline/snvCalling.sh
index de88566..fc7d668 100755
--- a/resources/analysisTools/snvPipeline/snvCalling.sh
+++ b/resources/analysisTools/snvPipeline/snvCalling.sh
@@ -30,7 +30,12 @@ getRefGenomeAndChrPrefixFromHeader ${TUMOR_BAMFILE_FULLPATH_BP} # Sets CHR_PREFI
 chr=""
 if [[ -n "$PARM_CHR_INDEX" ]]
 then
-    chr=${CHR_PREFIX}${PARM_CHR_INDEX}${CHR_SUFFIX}
+    if [[ "$PARM_CHR_INDEX" == "ALT" || "$PARM_CHR_INDEX" == "HLA" ]]
+    then
+        chr=${PARM_CHR_INDEX}
+    else
+        chr=${CHR_PREFIX}${PARM_CHR_INDEX}${CHR_SUFFIX}
+    fi
 else
     chr=${CHR_PREFIX}${CHROMOSOME_INDICES[$((RODDY_JOBARRAYINDEX-1))]}${CHR_SUFFIX}
     # File names can contain X and Y. The pattern contains RODDY_JOBARRAYINDEX, so this has to be changed temporarily, as the value is always numeric!
@@ -66,8 +71,23 @@ filenameMPileupError="${MPILEUP_SUBDIR}/${MPILEUPOUT_PREFIX}${PID}.${chr}.error"
 FILENAME_VCF_SNVS_TEMP=${FILENAME_VCF_SNVS}.tmp
 
-#[[ ! -f $filenameMPileupTemp ]] &&
-${BCFTOOLS_BINARY} mpileup ${MPILEUP_OPTS} -r ${chr} -f ${REFERENCE_GENOME} $tumorbamfullpath | ${BCFTOOLS_BINARY} call ${BCFTOOLS_OPTS} > $filenameMPileupTemp
+## For variant calling from the ALT and HLA regions of the GRCh38 genome.
+# The *_CONTIGS_FILE variables contain the lists of contig names.
+MPILEUP_REGION_OPTION=""
+if [[ "$chr" == "HLA" ]]
+then
+    MPILEUP_REGION_OPTION="-R ${CHROMOSOME_HLA_CONTIGS_FILE}"
+    MPILEUP_OPTS=$MPILEUP_OPTS_ALT_HLA
+elif [[ "$chr" == "ALT" ]]
+then
+    MPILEUP_REGION_OPTION="-R ${CHROMOSOME_ALT_CONTIGS_FILE}"
+    MPILEUP_OPTS=$MPILEUP_OPTS_ALT_HLA
+else
+    MPILEUP_REGION_OPTION="-r ${chr}"
+fi
+
+#[[ ! -f $filenameMPileupTemp ]] &&
+${BCFTOOLS_BINARY} mpileup ${MPILEUP_OPTS} ${MPILEUP_REGION_OPTION} -f ${REFERENCE_GENOME} $tumorbamfullpath | ${BCFTOOLS_BINARY} call ${BCFTOOLS_OPTS} > $filenameMPileupTemp
 
 if [[ "$?" != 0 ]]
 then
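For the GRCh38 pseudo-chromosomes, mpileup cannot be pointed at a single contig: ALT and HLA each stand for a whole list of contigs, so the hunk above switches from `-r <region>` to a regions file via `-R` and swaps in the dedicated `MPILEUP_OPTS_ALT_HLA` options. The dispatch logic, condensed (variable values come from the plugin configuration; the `call` options shown stand in for `${BCFTOOLS_OPTS}`):

```bash
# Sketch of the region dispatch above; "-mv" is a placeholder for ${BCFTOOLS_OPTS}.
case "$chr" in
    HLA) region_opt="-R ${CHROMOSOME_HLA_CONTIGS_FILE}"; MPILEUP_OPTS=$MPILEUP_OPTS_ALT_HLA ;;
    ALT) region_opt="-R ${CHROMOSOME_ALT_CONTIGS_FILE}"; MPILEUP_OPTS=$MPILEUP_OPTS_ALT_HLA ;;
    *)   region_opt="-r ${chr}" ;;
esac
bcftools mpileup ${MPILEUP_OPTS} ${region_opt} -f "${REFERENCE_GENOME}" tumor.bam | bcftools call -mv
```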
@@ -86,7 +106,7 @@ firstLineVCF=`cat $filenameMPileupTemp | grep -v "^#" | head -n1`
 if [[ -n "$firstLineVCF" ]]; then
     # filter out mutations with strand bias next to the motif GGnnGG (or CCnnCC if the snv is on the reverse strand)
     ${PERL_BINARY} "$TOOL_SEQ_CONTEXT_ANNOTATOR" "$FASTAFROMBED_BINARY" "$filenameMPileupTemp" "$REFERENCE_GENOME" 10 "$TOOLS_DIR" | \
-        ${PYTHON_BINARY} "$TOOL_RAW_SNV_FILTER" --outf="$snvOut" "$RAW_SNV_FILTER_OPTIONS" # --inf=$MPILEUP_SUBDIR/forStrandBiasFilter.${chr}.bed
+        ${PYTHON_BINARY} "$TOOL_RAW_SNV_FILTER" --outf="$snvOut" ${RAW_SNV_FILTER_OPTIONS} # --inf=$MPILEUP_SUBDIR/forStrandBiasFilter.${chr}.bed
 
     if [[ "$?" == 0 ]]
     then
@@ -103,7 +123,7 @@ if [[ -n "$firstLineVCF" ]]; then
     mkfifo $NP_MPILEUP
 
     # if there is a germline BAM, first look up these positions in the control file by just piling up the bases; this is NO SNV calling
-    ${SAMTOOLS_BINARY} mpileup ${MPILEUPCONTROL_OPTS} -r ${chr} -l ${filenameMPileupOut} -f ${REFERENCE_GENOME} ${CONTROL_BAMFILE_FULLPATH_BP} > ${NP_MPILEUP} &
+    ${SAMTOOLS_BINARY} mpileup ${MPILEUPCONTROL_OPTS} -l ${filenameMPileupOut} -f ${REFERENCE_GENOME} ${CONTROL_BAMFILE_FULLPATH_BP} > ${NP_MPILEUP} &
 
     ${PERL_BINARY} ${TOOL_VCF_PILEUP_COMPARE} ${filenameMPileupOut} $NP_MPILEUP "Header" > ${FILENAME_VCF_SNVS_TEMP}
     rm $NP_MPILEUP
diff --git a/resources/analysisTools/snvPipeline/snv_extractor_v1.pl b/resources/analysisTools/snvPipeline/snv_extractor_v1.pl
index cea35fa..d880b2e 100755
--- a/resources/analysisTools/snvPipeline/snv_extractor_v1.pl
+++ b/resources/analysisTools/snvPipeline/snv_extractor_v1.pl
@@ -64,18 +64,17 @@
 print STDERR "NOTE: The reading of the VCF will end with a broken pipe error, because we only read the header and then stop.\n";
 close IN;
 
-my @head=split("\t", $head);
+my @header=split("\t", $head);
 my %col;
-
 my $i = 0;
-while($i < @head)
+while($i < @header)
 {
-	if($head[$i] eq "EXONIC_CLASSIFICATION"){$col{"EXONIC_CLASSIFICATION"} = $i; print "EXONIC_CLASSIFICATION in column ", $i+1,"\n";}
-	if($head[$i] eq "ANNOVAR_FUNCTION"){$col{"ANNOVAR_FUNCTION"} = $i; print "ANNOVAR_FUNCTION in column ", $i+1,"\n";}
-	if($head[$i] eq "RECLASSIFICATION"){$col{"RECLASSIFICATION"} = $i; print "RECLASSIFICATION in column ", $i+1,"\n";}
-	if($head[$i] eq "ANNOTATION_control"){$col{"ANNOTATION_control"} = $i; print "ANNOTATION_control in column ", $i+1,"\n";}
-	if($head[$i] eq "CONFIDENCE"){$col{"CONFIDENCE"} = $i; print "CONFIDENCE in column ", $i+1,"\n";}
+	if($header[$i] eq "EXONIC_CLASSIFICATION"){$col{"EXONIC_CLASSIFICATION"} = $i; print "EXONIC_CLASSIFICATION in column ", $i+1,"\n";}
+	if($header[$i] eq "ANNOVAR_FUNCTION"){$col{"ANNOVAR_FUNCTION"} = $i; print "ANNOVAR_FUNCTION in column ", $i+1,"\n";}
+	if($header[$i] eq "RECLASSIFICATION"){$col{"RECLASSIFICATION"} = $i; print "RECLASSIFICATION in column ", $i+1,"\n";}
+	if($header[$i] eq "ANNOTATION_control"){$col{"ANNOTATION_control"} = $i; print "ANNOTATION_control in column ", $i+1,"\n";}
+	if($header[$i] eq "CONFIDENCE"){$col{"CONFIDENCE"} = $i; print "CONFIDENCE in column ", $i+1,"\n";}
 	$i++;
 }
diff --git a/resources/analysisTools/snvPipeline/snvsPerChromPlot.r b/resources/analysisTools/snvPipeline/snvsPerChromPlot.r
index 3e6c4cb..b4f3627 100755
--- a/resources/analysisTools/snvPipeline/snvsPerChromPlot.r
+++ b/resources/analysisTools/snvPipeline/snvsPerChromPlot.r
@@ -48,6 +48,7 @@ dat$chromosome <- factor(dat$chromosome, levels = paste0("chr",c(seq(1,22),"X","Y")))
 chromLength = read.table(file = opt$chromLengthFile, header = F)
+chromLength$V1 = gsub("chr", "", chromLength$V1)
 rownames(chromLength) = paste0("chr",chromLength$V1)
 chromLength = chromLength[,2, drop= F]
 chromLengthMB = round(chromLength/1000000)
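snvsPerChromPlot.r now strips any existing `chr` prefix from the chromosome-length file before re-adding it uniformly, so hg19-style (`1`) and hg38-style (`chr1`) chromSizes files both work. The equivalent normalization of such a file on the shell (file names are placeholders):

```bash
# Normalize a chromSizes file to a uniform "chr" prefix, mirroring the R fix above.
sed 's/^chr//' GRCh38.chromSizes | awk '{print "chr"$1"\t"$2}' > normalized.chromSizes
```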
diff --git a/resources/analysisTools/snvPipeline/tripletBased_BQDistribution_plotter.R b/resources/analysisTools/snvPipeline/tripletBased_BQDistribution_plotter.R
index d57fcb5..07b6bff 100755
--- a/resources/analysisTools/snvPipeline/tripletBased_BQDistribution_plotter.R
+++ b/resources/analysisTools/snvPipeline/tripletBased_BQDistribution_plotter.R
@@ -474,7 +474,7 @@ plot_BQD_to_pdf = function(PDF_OUTPUT_FILE, data.bq.triplet,
 molten.alt$nBases=transitionSubset[molten.alt$sample,"nBQ.alt"]
 molten.alt[,ReadPositionsQuantile_ColName] = sapply(transitionSubset[molten.alt$sample,"RP_string.alt"], function(positionsString) {
     positions = as.integer(unlist(strsplit(positionsString, ",")))
-    return (round(quantile(positions, ReadPositionQuantile)))
+    return (round(quantile(positions, ReadPositionQuantile, na.rm = TRUE)))
 })
 # molten.altReadPos = melt(baseScores.alt.counts.normalized.cumul, id.vars = "sample")
 
 if (whatToPlot == "BQD_sampleIndividual_ColoredByChromosome") {
diff --git a/resources/analysisTools/snvPipeline/vcf_pileup_compare_allin1_basecount.pl b/resources/analysisTools/snvPipeline/vcf_pileup_compare_allin1_basecount.pl
index e455d0a..7700ab8 100755
--- a/resources/analysisTools/snvPipeline/vcf_pileup_compare_allin1_basecount.pl
+++ b/resources/analysisTools/snvPipeline/vcf_pileup_compare_allin1_basecount.pl
@@ -136,6 +136,9 @@
 my $ccoord = -1; # make sure that ccoord in the first iteration is smaller than tcoord
 my $tcoord = 0;
 
+my $current_c_chr = "";
+my $current_t_chr = "";
+
 # one file is longer than the other, never mind which
 #(if we did pileup for only the tumor SNV positions, control will be shorter; otherwise longer)
 TUMORFILE_LOOP: while ($lineT=)
@@ -150,7 +153,9 @@
 chomp $lineT;
 @tum = split (/\s+/, $lineT);
 $tcoord = $tum[1]; # start coordinate
-if ($tcoord < $ccoord) # no matching control line => tumor position not covered in control (never true in 1st iteration)
+$current_t_chr = $tum[0];
+
+if ($tcoord < $ccoord && $current_t_chr eq $current_c_chr) # no matching control line => tumor position not covered in control (never true in 1st iteration)
 {
 	$missingC++;
 	# print the tumor line with the according "flag"
@@ -158,7 +163,7 @@
 	print $lineT, "\tDP=0;DP5=0,0,0,0,0;DP5all=0,0,0,0,0;ACGTNacgtnHQ=0,0,0,0,0,0,0,0,0,0;ACGTNacgtn=0,0,0,0,0,0,0,0,0,0;VAF=0;TSR=0\t$notcovered\n";
 	next; # read next line from tumor
 }
-if ($tcoord == $ccoord) # matching pair found!
+if ($tcoord == $ccoord && $current_t_chr eq $current_c_chr) # matching pair found!
 {
 	$match++;
 	chomp $lineC;
@@ -176,15 +181,16 @@
 	$ctrC++;
 	# split lines
 	@ctrl = split (/\s+/, $lineC);
+	$current_c_chr = $ctrl[0];
 	$ccoord = $ctrl[1];
-	if ($tcoord == $ccoord) # matching pair found!
+	if ($tcoord == $ccoord && $current_t_chr eq $current_c_chr) # matching pair found!
 	{
 		$match++;
 		# do the evaluation in a subroutine
 		evaluate_pos();
 		next TUMORFILE_LOOP; # and read next line from tumor
 	}
-	if ($tcoord < $ccoord) # new ccoord is higher than tcoord => tumor position not covered in control (should never be the case!)
+	if ($tcoord < $ccoord && $current_t_chr eq $current_c_chr) # new ccoord is higher than tcoord => tumor position not covered in control (should never be the case!)
 	{
 		$missingC++;
 		print $lineT, "\tDP=0;DP5=0,0,0,0,0;DP5all=0,0,0,0,0;ACGTNacgtnHQ=0,0,0,0,0,0,0,0,0,0;ACGTNacgtn=0,0,0,0,0,0,0,0,0,0;VAF=0;TSR=0\t$notcovered\n";
diff --git a/resources/configurationFiles/analysisSNVCalling.xml b/resources/configurationFiles/analysisSNVCalling.xml
index 68986cf..c337644 100755
--- a/resources/configurationFiles/analysisSNVCalling.xml
+++ b/resources/configurationFiles/analysisSNVCalling.xml
[The XML element bodies of this file's hunks (@@ -35,25 +35,23 @@, @@ -76,8 +74,6 @@, @@ -87,13 +83,11 @@, @@ -206,8 +202,8 @@, @@ -233,10 +229,10 @@, @@ -249,10 +245,10 @@ and @@ -265,7 +261,7 @@) were stripped together with their angle-bracket tags during text extraction and cannot be reconstructed here; only the hunk positions survive.]
diff --git a/resources/configurationFiles/analysisSNVCallingGRCh38.xml b/resources/configurationFiles/analysisSNVCallingGRCh38.xml
new file mode 100644
index 0000000..96d2dcf
--- /dev/null
+++ b/resources/configurationFiles/analysisSNVCallingGRCh38.xml
@@ -0,0 +1,72 @@
[The 72 added XML configuration lines of the new GRCh38 analysis configuration were likewise lost when the markup was stripped; see the hg38 configuration snippet in the Example Call section for the user-facing part of this configuration.]