From 2dbc4f1c68cf4992b5a5409b413f3d8c3a95c456 Mon Sep 17 00:00:00 2001 From: Torsten Seemann Date: Fri, 18 Oct 2019 16:57:03 +1100 Subject: [PATCH] Add '-E max_len' to 'seqtk seq' --- Makefile | 2 +- seqtk.c | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 0a60ccc..c98e918 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ CC=gcc -CFLAGS=-g -Wall -O2 -Wno-unused-function +CFLAGS=-g -Wall -Ofast -Wno-unused-function BINDIR=/usr/local/bin all:seqtk diff --git a/seqtk.c b/seqtk.c index fa64393..b3b047b 100644 --- a/seqtk.c +++ b/seqtk.c @@ -1215,14 +1215,14 @@ int stk_seq(int argc, char *argv[]) { gzFile fp; kseq_t *seq; - int c, qual_thres = 0, flag = 0, qual_shift = 33, mask_chr = 0, min_len = 0, max_q = 255, fake_qual = -1; + int c, qual_thres = 0, flag = 0, qual_shift = 33, mask_chr = 0, min_len = 0, max_len = INT_MAX, max_q = 255, fake_qual = -1; unsigned i, line_len = 0; int64_t n_seqs = 0; double frac = 1.; khash_t(reg) *h = 0; krand_t *kr = 0; - while ((c = getopt(argc, argv, "N12q:l:Q:aACrn:s:f:M:L:cVUX:SF:")) >= 0) { + while ((c = getopt(argc, argv, "N12q:l:Q:aACrn:s:f:M:L:E:cVUX:SF:")) >= 0) { switch (c) { case 'a': case 'A': flag |= 1; break; @@ -1242,6 +1242,7 @@ int stk_seq(int argc, char *argv[]) case 'X': max_q = atoi(optarg); break; case 'l': line_len = atoi(optarg); break; case 'L': min_len = atoi(optarg); break; + case 'E': max_len = atoi(optarg); break; case 's': kr = kr_srand(atol(optarg)); break; case 'f': frac = atof(optarg); break; case 'F': fake_qual = *optarg; break; @@ -1259,7 +1260,8 @@ int stk_seq(int argc, char *argv[]) fprintf(stderr, " -s INT random seed (effective with -f) [11]\n"); fprintf(stderr, " -f FLOAT sample FLOAT fraction of sequences [1]\n"); fprintf(stderr, " -M FILE mask regions in BED or name list FILE [null]\n"); - fprintf(stderr, " -L INT drop sequences with length shorter than INT [0]\n"); + fprintf(stderr, " -L INT drop sequences with length shorter than INT [%d]\n", min_len); + fprintf(stderr, " -E INT drop sequences with length longer than INT [%d]\n", max_len); fprintf(stderr, " -F CHAR fake FASTQ quality []\n"); fprintf(stderr, " -c mask complement region (effective with -M)\n"); fprintf(stderr, " -r reverse complement\n"); @@ -1286,6 +1288,7 @@ int stk_seq(int argc, char *argv[]) while (kseq_read(seq) >= 0) { ++n_seqs; if (seq->seq.l < min_len) continue; // NB: length filter before taking random + if (seq->seq.l > max_len) continue; // NB: length filter before taking random if (frac < 1. && kr_drand(kr) >= frac) continue; if (flag & 48) { // then choose odd/even reads only if ((flag&16) && (n_seqs&1) == 0) continue;