Commit 35d0f5c7 authored by CJHerrmann

adapted for slurm, allow 1 mismatch in last 4nt

parent 90c6ef59

Snakefile:

configfile: "Aseq_config.yaml"
from snakemake.utils import makedirs
from snakemake.utils import listfiles
@@ -21,14 +19,14 @@ rule finish:
snakemake was run locally.
'''
input: config["results_dir"] + "/counts/annotation_overlap/clusters.summary.tsv"
-    params:
-        logdir = config["log_dir"]
-    run:
+    #params:
+    #    logdir = config["log_dir"]
+    #run:
# check if cluster_logs dir for first sample is empty
# if yes, delete the cluster_logs dir
-        file_list = os.listdir("{log}/cluster_logs/".format(log=params.logdir) )
-        if len(file_list) == 0:
-            shell("rm -r {params.logdir}/cluster_logs")
+        #file_list = os.listdir("{log}/cluster_logs/".format(log=params.logdir) )
+        #if len(file_list) == 0:
+        #    shell("rm -r {params.logdir}/cluster_logs")
################################################################################
# chronological processing of the reads
@@ -37,6 +35,7 @@ rule finish:
#-------------------------------------------------------------------------------
# create dir for logfiles
#-------------------------------------------------------------------------------
+cluster_logs = os.path.join( config["log_dir"], "cluster_logs")
rule create_log_dir:
## LOCAL ##
@@ -47,7 +46,7 @@ rule create_log_dir:
output:
temp( config["results_dir"] + "/created_log.tmp" )
run:
-        makedirs( [config["log_dir"], os.path.join( config["log_dir"], "cluster_logs") ] )
+        makedirs( [config["log_dir"], cluster_logs] )
shell("touch {output}")
#-------------------------------------------------------------------------------
@@ -75,7 +74,7 @@ rule fastq2fasta:
config["results_dir"] + "/created_log.tmp"
output: temp(config["results_dir"] + "/{sample}.fa.gz")
params:
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
log:
config["log_dir"] + "/fastq2fasta.{sample}.log"
shell:
@@ -116,7 +115,7 @@ rule select_for_valid_5p_configuration:
output: temp(config["results_dir"] + "/{sample}.5ptrimmed.UMI_available.fa.gz")
params:
adapt=config['read_5p_start'],
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
log:
config["log_dir"] + "/filter_5p_read_start.{sample}.log"
shell:
@@ -160,9 +159,9 @@ rule collapse_full_UMI_duplicates:
input: config["results_dir"] + "/{sample}.5ptrimmed.UMI_available.fa.gz"
output: temp( config["results_dir"] + "/{sample}.UMI_dupl_removed.fa.gz" )
params:
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
resources:
-        mem = 10
+        mem = 20
log:
config["log_dir"] + "/collapse_full_UMI_duplicates.{sample}.log"
shell:
@@ -208,7 +207,7 @@ rule trim_3p_adapter:
params:
adapt=config['3p_adapter'],
minLen=config['min_length'],
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
log:
config["log_dir"] + "/trim_3p_adapter.{sample}.log"
shell:
@@ -261,7 +260,7 @@ rule get_valid_reads:
input:
config["results_dir"] + "/{sample}.trimmed.UMI.fa.gz"
output:
-        temp( config["results_dir"] + "/{sample}.valid.trimmed.UMI.fa.gz" )
+        config["results_dir"] + "/{sample}.valid.trimmed.UMI.fa.gz"
params:
maxN = config['maxN'],
maxAcontent = config['maxAcontent']
@@ -308,16 +307,16 @@ rule valid_read_cnt:
rule create_STAR_index:
output:
-        config['STAR_idx'] + "/SAindex"
+        config['STAR_idx'] + str(config['read_length']) + "/SAindex"
params:
read_len = lambda wildcards: int(config['read_length']) - 7 - 1,
genome = config['genome'],
gtf = config['gene_annotation'],
-        idx_dir = config['STAR_idx'],
-        cluster_log = config["log_dir"] + "/cluster_logs/creat_STAR_index.log"
+        idx_dir = config['STAR_idx'] + str(config['read_length']),
+        cluster_log = cluster_logs
threads: config["threads.STAR"]
resources:
-        mem = lambda wildcards: int(np.ceil( config['STAR.max.RAM'] / config['threads.STAR'] ) )
+        mem = config['STAR.max.RAM']
log:
config["log_dir"] + "/create_STAR_index.log"
shell:
@@ -342,16 +341,16 @@ rule create_STAR_index:
rule mapping:
input:
config["results_dir"] + "/{sample}.valid.trimmed.UMI.fa.gz",
-        config['STAR_idx'] + "/SAindex"
+        config['STAR_idx'] + str(config['read_length']) + "/SAindex"
output:
config["results_dir"] + "/{sample}.STAR_out/Aligned.sortedByCoord.out.bam"
params:
anno = config['gene_annotation'],
outdir = config["results_dir"] + "/{sample}.STAR_out/",
-        idx_dir = config['STAR_idx'],
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        idx_dir = config['STAR_idx'] + str(config['read_length']),
+        cluster_log = cluster_logs
resources:
-        mem = lambda wildcards: int(np.ceil( config['STAR.max.RAM'] / config['threads.STAR'] ) )
+        mem = config['STAR.max.RAM']
threads: config['threads.STAR']
log:
config["log_dir"] + "/mapping.{sample}.log"
@@ -387,9 +386,9 @@ rule bam2bed:
input:
config["results_dir"] + "/{sample}.STAR_out/Aligned.sortedByCoord.out.bam"
output:
-        temp( config["results_dir"] + "/{sample}.reads.bed.gz" )
+        config["results_dir"] + "/{sample}.reads.bed.gz"
params:
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
resources:
mem=5
threads: config['threads.bam2bed']
@@ -412,9 +411,9 @@ rule select_by_edit_distance:
input:
config["results_dir"] + "/{sample}.reads.bed.gz"
output:
-        temp( config["results_dir"] + "/{sample}.reads.edit_distance_filtered.bed.gz" )
+        config["results_dir"] + "/{sample}.reads.edit_distance_filtered.bed.gz"
params:
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
resources:
mem = 10
log:
@@ -434,10 +433,10 @@ rule collapse_mapped_UMI_duplicates:
input:
config["results_dir"] + "/{sample}.reads.edit_distance_filtered.bed.gz"
output:
-        temp(config["results_dir"] + "/{sample}.reads.collapsed.bed.gz"),
+        config["results_dir"] + "/{sample}.reads.collapsed.bed.gz",
temp(config["results_dir"] + "/counts/{sample}.mapped.nr.out")
params:
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
log:
config["log_dir"] + "/collapse_mapped_UMI_duplicates.{sample}.log"
shell:
@@ -481,11 +480,11 @@ rule get_3p_ends:
input:
config["results_dir"] + "/{sample}.reads.collapsed.bed.gz"
output:
-        temp( config["results_dir"] + "/{sample}.3pSites.bed.gz")
+        config["results_dir"] + "/{sample}.3pSites.bed.gz"
params:
exclude_chr=lambda wildcards: "--exclude=" + ":".join(config['excluded_chr']) if config['excluded_chr'] is not None else "",
min_align = config['min_3p_align'],
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
resources:
mem = 10
log:
@@ -536,7 +535,7 @@ rule fetch_flanking_seqs:
upstream_ext = config['upstream_region.IP'],
downstream_ext = config['downstream_region.IP'],
genome = config["genome"],
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
resources:
mem = 10
log:
@@ -566,7 +565,7 @@ rule assign_IP_sites:
tot_As = config['total_As'],
consec_As = config['consecutive_As'],
ds_patterns = lambda wildcards: " ".join(["--ds_pattern=%s" % pat for pat in config['downstream_patterns'] ]) if config['downstream_patterns'] is not None else "",
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
log:
config["log_dir"] + "/assign_IP_sites.{sample}.log"
shell:
@@ -641,7 +640,7 @@ rule sites_feature_overlap_stats:
params:
gtf = config['gene_annotation'],
utr_name = config['utr_name'],
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
log:
config["log_dir"] + "/sites_feature_overlap_stats.{sample}.log"
shell:
@@ -665,7 +664,7 @@ rule pool_samples:
output:
config["results_dir"] + "/3pSites.tsv.gz"
params:
-        cluster_log = config["log_dir"] + "/cluster_logs/clustering_process.log"
+        cluster_log = cluster_logs
resources:
mem = lambda wildcards: int( 1.5 * len(config['samples']) )
log:
@@ -734,7 +733,9 @@ rule assign_polyA_signals:
params:
signals = lambda wildcards: " ".join(["--motif=%s" % sig for sig in config['polyA.signals'] ]),
genome = config['genome'],
-        cluster_log = config["log_dir"] + "/cluster_logs/clustering_process.log"
+        cluster_log = cluster_logs
+    resources:
+        mem = 20
log:
config["log_dir"] + "/assign_polyA_signals.log"
shell:
@@ -758,12 +759,12 @@ rule sample_specific_bg:
input:
config["results_dir"] + "/3pSites.PAS.tsv.gz"
output:
-        temp( config["results_dir"] + "/filteredSites/{sample}.filtered.tsv" )
+        config["results_dir"] + "/filteredSites/{sample}.filtered.tsv"
params:
cutoff = config['polyAsignal_cutoff'],
upstream_reg = config['upstream_bg_clusters'],
downstream_reg = config['downstream_bg_clusters'],
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
resources:
mem = 10
log:
@@ -790,7 +791,7 @@ rule create_noBG_3pSites_table:
output:
table_adjusted = temp(config["results_dir"] + "/3pSites.PAS.filtered.tsv.gz")
params:
-        cluster_log = config["log_dir"] + "/cluster_logs/create_noBG_3pSites_table.log"
+        cluster_log = cluster_logs
resources:
mem = 10
log:
@@ -807,7 +808,7 @@ rule delete_zero_rows:
input:
config["results_dir"] + "/3pSites.PAS.filtered.tsv.gz"
output:
-        temp( config["results_dir"] + "/3pSites.PAS.noBG.tsv.gz" )
+        config["results_dir"] + "/3pSites.PAS.noBG.tsv.gz"
log:
config["log_dir"] + "/delete_zero_rows.log"
run:
@@ -833,11 +834,11 @@ rule cluster_sites:
input:
config["results_dir"] + "/3pSites.PAS.noBG.tsv.gz"
output:
-        temp( config["results_dir"] + "/clusters.primary.tsv.gz" )
+        config["results_dir"] + "/clusters.primary.tsv.gz"
params:
upstream_ext = config['upstream_clusters'],
downstream_ext = config['downstream_clusters'],
-        cluster_log = config["log_dir"] + "/cluster_logs/clustering_process.log"
+        cluster_log = cluster_logs
log:
config["log_dir"] + "/cluster_sites.log"
shell:
@@ -868,7 +869,7 @@ rule merge_clusters:
params:
maxsize = config['max_cluster_size'],
minDistToPAS = config['min_dist_to_PAS'],
-        cluster_log = config["log_dir"] + "/cluster_logs/clustering_process.log"
+        cluster_log = cluster_logs
log:
config["log_dir"] + "/merge_clusters.log"
shell:
@@ -890,7 +891,7 @@ rule clusters_to_bed:
output:
config["results_dir"] + "/clusters.merged.bed"
params:
-        cluster_log = config["log_dir"] + "/cluster_logs/clustering_process.log"
+        cluster_log = cluster_logs
log:
config["log_dir"] + "/clusters_to_bed.log"
shell:
@@ -941,7 +942,7 @@ rule clusters_feature_overlap_stats:
params:
gtf = config['gene_annotation'],
utr_name = config['utr_name'],
-        cluster_log = config["log_dir"] + "/cluster_logs/clustering_process.log"
+        cluster_log = cluster_logs
log:
config["log_dir"] + "/clusters_feature_overlap_stats.log"
shell:
......

cluster_config.json:

{
"__default__":
{
"threads":"1",
"mem":"2G",
"time":"01:00:00"
"queue":"6hours",
"threads":"1",
"mem":"4G",
"time":"01:00:00",
"name":"{rule}.{wildcards}",
"out":"{params.cluster_log}/{rule}.{wildcards}-%j-%N.out"
},
"collapse_full_UMI_duplicates":
{
@@ -12,12 +15,14 @@
"create_STAR_index":
{
"threads": "{threads}",
"mem": "{resources.mem}G"
"mem": "{resources.mem}G",
"time": "06:00:00"
},
"mapping":
{
"threads": "{threads}",
"mem": "{resources.mem}G"
"mem": "{resources.mem}G",
"time": "06:00:00"
},
"select_by_edit_distance":
{
@@ -36,6 +41,15 @@
{
"mem": "{resources.mem}G"
},
"pool_samples":
{
"mem": "{resources.mem}G"
},
"assign_polyA_signals":
{
"time":"06:00:00",
"mem": "{resources.mem}G"
},
"sample_specific_bg":
{
"mem": "{resources.mem}G"
......

Aseq_config.yaml:

---
################################################################################
# directories
################################################################################
# directory where the input fastq file(s) are located
input_dir: input
# directory in which the pipeline writes all output files, including the
# summary files (relative to the current directory)
results_dir: output
# directory to which the log files created during processing are written
log_dir: logs
################################################################################
# sample(s) information
################################################################################
# name(s) of the input fastq file(s)
# if the sample file is named sample1.fq.gz, enter here: sample1
# this name is used throughout the entire pipeline to name the output files
samples:
-
# read length (i.e. the number of sequencing cycles) of the reads in this study
read_length: 63
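# note (editor's sketch): elsewhere in the Snakefile the STAR index is built
# with read_len = read_length - 7 - 1; the 7 presumably accounts for the
# 4 random (UMI) nucleotides plus the TTT of read_5p_start that are removed
# before mapping, and the extra -1 matches STAR's sjdbOverhang convention
# (mapped read length - 1). For read_length: 63 this gives 63 - 7 - 1 = 55.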
################################################################################
# genome information
################################################################################
# after cloning, the "resources" directory only contains an Ensembl gtf file
# of the human genome
gene_annotation: resources/Homo_sapiens.GRCh38.87.gtf
# Have a look in the README file if you want or need to use a different gtf
# indicate the name 3' UTRs are labeled with in the gtf file
# note: in case 5' and 3' UTRs are labeled with the same name,
#       summaries with the fraction of sites that overlap with UTR features
#       might include overlaps with 5' UTRs (however, this fraction is
#       expected to be small)
utr_name: three_prime_utr
# genomic sequences (again, consult the README for the command that
# downloads the genome in fasta format from Ensembl)
# after downloading, enter the path and the file name here
genome: resources/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa
################################################################################
# read_preprocessing
################################################################################
#-------------------------------------------------------------------------------
# adapter trimming
#-------------------------------------------------------------------------------
# the 5' configuration that reads are expected to start with
# note: "." characters represent any random nucleotide (the UMI)
# note: the last three nucleotides are required to be TTT! The number
#       of preceding random nucleotides is adjustable according to the
#       setup used
read_5p_start: ....TTT
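# example (editor's illustration): for a read starting with ACGTTTTGCCAT...,
# "...." captures the UMI ACGT, the following TTT is required and trimmed,
# and GCCAT... is what remains for the downstream steps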
# the 3' adapter that is trimmed from the 3' end of reads
3p_adapter: TGGAATTCTCGGGTGCCAAGG
# the minimum length each read must have after adapter trimming (5' and 3')
# (shorter reads are discarded)
min_length: 15
#-------------------------------------------------------------------------------
# valid read parameters
#-------------------------------------------------------------------------------
# reads are considered valid based on the following criteria
# maximum number of Ns per read/sequence
maxN: 2
# maximum fraction of As per read/sequence
maxAcontent: 0.8
# the last nucleotide must not be an A (no parameter value necessary here)
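# a minimal sketch (editor's illustration, not the pipeline's actual code) of
# how these three criteria could be applied to a read sequence:
#     def is_valid(seq, maxN=2, maxAcontent=0.8):
#         return (seq.count("N") <= maxN
#                 and seq.count("A") / len(seq) <= maxAcontent
#                 and not seq.endswith("A"))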
################################################################################
# from mapping to 3' end site inference
################################################################################
#-------------------------------------------------------------------------------
# STAR mapping
#-------------------------------------------------------------------------------
# the directory in which the STAR genome index will be stored
# (the pipeline appends the read length to this path)
# the first run of the pipeline will generate the index automatically
# and store it in the provided directory
STAR_idx: resources/Homo_sapiens.GRCh38.index
# STAR requires a considerable amount of memory to work efficiently
# if you plan to run the pipeline on a cluster, the indexing and mapping
# jobs are submitted with STAR.max.RAM GB of RAM
STAR.max.RAM: 50
# STAR is much faster when run in parallel
# set the number of cores STAR is allowed to use here
threads.STAR: 12
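# note (editor's sketch): with the cluster configuration shipped alongside,
# indexing and mapping jobs are submitted via sbatch with --mem {resources.mem}G,
# i.e. 50G in total per job (not per CPU), while STAR itself parallelizes
# over threads.STAR cores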
#-------------------------------------------------------------------------------
# convert BAM to BED
#-------------------------------------------------------------------------------
# number of threads used for this task
threads.bam2bed: 8
#-------------------------------------------------------------------------------
# get initial raw 3' end processing sites
#-------------------------------------------------------------------------------
# The last x nucleotides at the 3' end of each read should map perfectly to
# the genomic sequence. This ensures that true 3' end processing sites
# are inferred.
# New: 1 mismatch/insertion/deletion is allowed in the last x nucleotides
# Set x here:
min_3p_align: 4
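# examples (editor's note), for min_3p_align: 4 with the 1-difference rule:
#   - the last 4+ nt match perfectly                       -> site is used
#   - one mismatch/indel flanked by >= 3 matching nt       -> site is used
#   - two or more differences within the last 4 nt         -> read is skipped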
# This parameter allows excluding mappings to certain chromosomes, for example
# the Y chromosome or the mitochondrial chromosome, if desired
# Note: - chromosome names must match the naming scheme of the used gtf file
#       - currently, no chromosome is excluded because M and Y are commented out
excluded_chr:
# - M
# - Y
#-------------------------------------------------------------------------------
# assign 3' end sites that originate from internal priming
#-------------------------------------------------------------------------------
# define the genomic region around 3' end processing sites that is used
# to assess internal priming
# default: the 10 nt downstream are considered to define internal priming events
upstream_region.IP: 0
downstream_region.IP: 10
# given the downstream region, set values for the number of total As and
# consecutive As that must occur for the site to be marked
# as an internal priming event
total_As: 7
consecutive_As: 6
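# example (editor's note): with total_As: 7 and consecutive_As: 6, a site
# whose 10 nt downstream region reads AAAAAAGTAC (7 As in total, 6 of them
# consecutive) would be marked as an internal priming event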
# define patterns that are indicative of internal priming when they occur
# directly downstream of the cleavage site
downstream_patterns:
################################################################################
# combine closely spaced 3' end sites into 3' end clusters
################################################################################
#-------------------------------------------------------------------------------
# assign poly(A) signals
#-------------------------------------------------------------------------------
# the list of poly(A) signals that are annotated for single 3' end processing
# sites when found in the region -60 to +10 nt around the cleavage site
# (a commented sketch of the lookup follows the list below)
polyA.signals:
- AATAAA
- ATTAAA
- TATAAA
- AGTAAA
- AATACA
- CATAAA
- AATATA
- GATAAA
- AATGAA
- AATAAT
- AAGAAA
- ACTAAA
- AATAGA
- ATTACA
- AACAAA
- ATTATA
- AACAAG
- AATAAG
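# a minimal sketch (editor's illustration; the pipeline's own script may
# differ) of annotating a single cleavage site with these motifs:
#     signals = ["AATAAA", "ATTAAA"]  # ... plus the remaining motifs above
#     region = genome[site - 60 : site + 10]   # -60..+10 nt around the site
#     found = [s for s in signals if s in region]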
#-------------------------------------------------------------------------------
# define a sample-specific background (BG) of site expression
# for this purpose, run a rough clustering of sites and
# filter out 3' end sites that are not expressed above the sample-specific BG
#-------------------------------------------------------------------------------
# cutoff: the percentage of clusters that are required to have a poly(A) signal
polyAsignal_cutoff: 90
# up- and downstream regions around highly expressed 3' end sites within
# which lower expressed sites are searched for and clustered to the higher
# expressed one
upstream_bg_clusters: 25
downstream_bg_clusters: 25
#-------------------------------------------------------------------------------
# cluster together closely spaced 3' end sites
#-------------------------------------------------------------------------------
# up- and downstream regions to search for sites that get clustered to
# the current site (similar to above)
upstream_clusters: 12
downstream_clusters: 12
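# a rough sketch (editor's illustration; the repository's clustering script
# may differ) of the greedy idea: visit sites by decreasing expression and
# attach any still-unclustered site within the window to the current one:
#     for site in sorted(sites, key=lambda s: -s.tpm):    # hypothetical objects
#         if site.clustered:
#             continue
#         site.clustered = True                           # site seeds a cluster
#         for other in within(site, up=12, down=12):      # hypothetical helper
#             if not other.clustered:
#                 other.cluster, other.clustered = site, True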
#-------------------------------------------------------------------------------
# merge closely spaced clusters
#-------------------------------------------------------------------------------
# define a maximum cluster size
# merged clusters must not be larger than this size (unless they share the
# same poly(A) signals)
max_cluster_size: 25
# rescue clusters that are potential internal priming events when the most
# downstream 3' end site of the cluster has a minimum distance to the next
# upstream poly(A) signal -> set this minimum distance here
min_dist_to_PAS: 15
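# example (editor's note): a poly(A) signal starting 21 nt upstream of the
# cluster's most downstream site satisfies min_dist_to_PAS: 15 and rescues
# the cluster, whereas one only 8 nt upstream does not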
jobscript.withJOBID.withPATH.sh:

#!/bin/sh
# properties = {properties}
PATH=[PATH TO THE CONDA ENV THAT IS TO BE USED ON SLURM]/bin:${{PATH}}
export PATH
echo -e "JOB ID\t$JOBID"
echo "=============================="
echo -e "rule\t$NAME"
echo -e "==============================\n"
{exec_job}
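
# snakemake invocation that submits each rule's jobs to SLURM via sbatch: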
# set -e
snakemake \
-p \
--configfile Aseq_config.yaml \
--cluster-config cluster_config.json \
--jobscript jobscript.withJOBID.withPATH.sh \
--cores=256 \
--local-cores 10 \
--cluster "sbatch \
--cpus-per-task {cluster.threads} \
--mem {cluster.mem} \
--qos {cluster.queue} \
--time {cluster.time} \
--job-name={cluster.name} \
-o {cluster.out} \
        -p PARTITION" \
--latency-wait 60 \
--reason \
&>> Aseq.log
@@ -12,11 +12,12 @@ my $TPM_clusters_gt_1nt = 0;
my $total_TPM = 0;
my %data = ();
my @LIBSIZE = ();
my @SAMPLE_NAME = ();
my $N = 0;
my $upstream;
my $downstream;
my $nr_of_samples;
my $last_sample_col;
GetOptions(
"upstream=i" => \$upstream,
@@ -36,18 +37,19 @@ open( F, "gunzip -c $ARGV[0] |" );
while (<F>) {
chomp;
if ( $_ =~ m/^#(.+)/ ) {
# #3;output_dir/SRR3543897.3pSites.ip.bed.gz;2409790.05912655
my @F = split( /;/, $1 );
my @filepath = split( /\//, $F[1] );
my @filename = split ( /\./, $filepath[-1] );
$SAMPLE_NAME[ $F[0] ] = $filename[0];
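        # e.g. for the header format shown above (editor's note):
        # $SAMPLE_NAME[3] = "SRR3543897", $LIBSIZE[3] = 2409790.05912655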
$LIBSIZE[ $F[0] ] = $F[2];
next;
}
my @F = split(/[\t\s]+/);
$N++;
if( not defined $nr_of_samples ) {
my @tmp_ar = @F[3 .. ($#F-2)];
$nr_of_samples = scalar @tmp_ar;
}
$last_sample_col = $#F -2; # for looping over sample names;
my $OVERALL_TPM = 0;
my $n_supported = 0;
my @tmp = ();
@@ -76,8 +78,8 @@ my $ip_candidates = 0;
# print header line
print "#chr\tstrand\tstart\tend\trep\ttotal_tpm\trepSite_signals";
foreach my $i (0 .. ($nr_of_samples - 1)) {
print "\tsample_",$i,"_tpm";
foreach my $idx ( 3 .. $last_sample_col ) {
print "\t",$SAMPLE_NAME[ $idx ],"_tpm";
}
print "\n";
......
@@ -2,6 +2,8 @@ use strict;
use warnings;
use Getopt::Long;
# Modified 30.04.2021 by Mihaela Zavolan to allow one mismatch in the last x (min_align) nucleotides of the mapped read.
my $help;
my $debug;
my $strict;
@@ -28,10 +30,6 @@ if ($showhelp) {
exit(2);
}
if ( $ARGV[0] !~ /gz$/ ) {
print STDERR "[ERROR] The input $ARGV[0] is required to be gzipped\n\n";
exit(2);
}
my %exclude_chr = ();
if ( defined $exclude) {
@@ -55,7 +53,14 @@ foreach my $strand ( '+', '-' ) {
my %hash = ();
my %hashdebug = ();
open( F, "gzip -dc $ARGV[0] |" ) or die "Can't open pipe to $ARGV[0]\n";
if ( $ARGV[0] !~ /gz$/ ) {
open(F, $ARGV[0]);
} else {
open( F, "gzip -dc $ARGV[0] |" ) or die "Can't open pipe to $ARGV[0]\n";
}
while (<F>) {
chomp;
my @F = split(/\t/);
@@ -64,32 +69,59 @@ while (<F>) {
next if ( defined $exclude_chr{ $F[0] } );
if ($strict) {
my $alignflag = 0;
        # length of the pattern encoding a single-character difference
        # format: 15MATIGDCDT12 = 15 matches -> 1 mismatch ("MAT": A in
        #         reference, T in read) -> 1 insertion ("IG": G in read) ->
        #         1 deletion ("DC": C in reference) -> 1 deletion ("DT": T in
        #         reference) -> 12 matches
my $len_scd = 3;
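        # worked examples (editor's note; plus strand, min_align = 4):
        #   "30"      perfect match                                   -> kept
        #   "26MAT2"  one mismatch, 26 + 2 flanking matches           -> kept
        #   "30MAT"   one mismatch at the very end, 30 matches before -> kept
        #   "2MATIG1" two differences ("MATIG" exceeds $len_scd)      -> skipped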
#plus strand alignment
if ( $F[5] eq '+' ) {
if ( $F[6] =~ m/^\d+$/ ) {
# perfect match: okay
$alignflag = 1;
} elsif ( $F[6] !~ m/\d+$/ ) {
# end badly aligned: skip
} else {
if ( $F[6] =~ m/(.*\D)(\d+)$/ ) {
if ( $2 >= $min_align ) {
$alignflag = 1;
}
if ( $F[6] =~ m/\D*(\d+)$/ ) {
my ($m) = $1;
# Number of matches at end high enough?
if($m >= $min_align) {
$alignflag = 1;
# if not, there's another chance if only 1 indel/mm follows
} elsif ( $F[6] =~ m/\D*(\d+)(\D+)(\d+)$/ ) {
my ($m1, $mm, $m2) = ($1, $2, $3);
                    # single indel/mismatch within the last min_align characters of the alignment
if(length($mm) <= $len_scd) {
if($m1 + $m2 >= $min_align-1) {
$alignflag = 1;
}
}
}
} elsif ( $F[6] =~ m/\D*(\d+)(\D+)$/ ) {
my ($m, $mm) = ($1, $2);
# single indel/mismatch at the end of the alignment
if(length($mm) <= $len_scd && $m >= $min_align-1) {
$alignflag = 1;
}
}
#minus strand alignment
} else {
if ( $F[6] =~ m/^\d+$/ ) {
# perfect match: okay
$alignflag = 1;
} elsif ( $F[6] !~ m/^\d+/ ) {
# end badly aligned: skip
} else {
if ( $F[6] =~ m/^(\d+)(\D.*)/ ) {
if ( $1 >= $min_align ) {
if ( $F[6] =~ m/^(\d+)\D*/ ) {
my ($m) = $1;
# Number of matches at end high enough?
if($m >= $min_align) {
$alignflag = 1;
# If not, there's another chance if only 1 indel/mm follows
} elsif ( $F[6] =~ m/^(\d+)(\D+)(\d+)\D*/ ) {
my ($m1, $mm, $m2) = ($3, $2, $1);
                    # single indel/mismatch within the last min_align characters of the alignment
if(length($mm) <= $len_scd) {
if($m1 + $m2 >= $min_align-1) {
$alignflag = 1;
}
}
}
}
} elsif ( $F[6] =~ m/^(\D+)(\d+)\D*/ ) {
my ($m, $mm) = ($2, $1);
# single indel/mismatch at the end of the alignment
if(length($mm) <= $len_scd && $m >= $min_align-1) {
$alignflag = 1;
}
}
}
next if ( $alignflag == 0 );
}
......