diff --git a/Snakefile b/Snakefile
index 133a792762d487e6c85ae269dc261f6fd6792ef4..73e93dc0dd00eb2da1169615e8798b480541d72e 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1,5 +1,3 @@
-configfile: "Aseq_config.yaml"
-
 from snakemake.utils import makedirs
 from snakemake.utils import listfiles
 
@@ -21,14 +19,14 @@ rule finish:
        snakemake was run locally.
     '''
     input:
        config["results_dir"] + "/counts/annotation_overlap/clusters.summary.tsv"
-    params:
-       logdir = config["log_dir"]
-    run:
+    #params:
+    #   logdir = config["log_dir"]
+    #run:
        # check if cluster_logs dir for first sample is empty
        # if yes, delete the cluster_logs dir
-       file_list = os.listdir("{log}/cluster_logs/".format(log=params.logdir) )
-       if len(file_list) == 0:
-           shell("rm -r {params.logdir}/cluster_logs")
+       #file_list = os.listdir("{log}/cluster_logs/".format(log=params.logdir) )
+       #if len(file_list) == 0:
+       #    shell("rm -r {params.logdir}/cluster_logs")
 
 ################################################################################
 # chronological processing of the reads
@@ -37,6 +35,7 @@ rule finish:
 #-------------------------------------------------------------------------------
 # create dir for logfiles
 #-------------------------------------------------------------------------------
+cluster_logs = os.path.join( config["log_dir"], "cluster_logs")
 
 rule create_log_dir:
     ## LOCAL ##
@@ -47,7 +46,7 @@
     output:
        temp( config["results_dir"] + "/created_log.tmp" )
     run:
-        makedirs( [config["log_dir"], os.path.join( config["log_dir"], "cluster_logs") ] )
+        makedirs( [config["log_dir"], cluster_logs] )
        shell("touch {output}")
@@ -75,7 +74,7 @@ rule fastq2fasta:
        config["results_dir"] + "/created_log.tmp"
     output:
        temp(config["results_dir"] + "/{sample}.fa.gz")
     params:
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
     log:
        config["log_dir"] + "/fastq2fasta.{sample}.log"
     shell:
@@ -116,7 +115,7 @@
     output:
        temp(config["results_dir"] + "/{sample}.5ptrimmed.UMI_available.fa.gz")
     params:
        adapt=config['read_5p_start'],
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
     log:
        config["log_dir"] + "/filter_5p_read_start.{sample}.log"
     shell:
@@ -160,9 +159,9 @@
     input:
        config["results_dir"] + "/{sample}.5ptrimmed.UMI_available.fa.gz"
     output:
        temp( config["results_dir"] + "/{sample}.UMI_dupl_removed.fa.gz" )
     params:
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
     resources:
-        mem = 10
+        mem = 20
     log:
        config["log_dir"] + "/collapse_full_UMI_duplicates.{sample}.log"
     shell:
@@ -208,7 +207,7 @@
     params:
        adapt=config['3p_adapter'],
        minLen=config['min_length'],
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
     log:
        config["log_dir"] + "/trim_3p_adapter.{sample}.log"
     shell:
@@ -261,7 +260,7 @@
     input:
        config["results_dir"] + "/{sample}.trimmed.UMI.fa.gz"
     output:
-        temp( config["results_dir"] + "/{sample}.valid.trimmed.UMI.fa.gz" )
+        config["results_dir"] + "/{sample}.valid.trimmed.UMI.fa.gz"
     params:
        maxN = config['maxN'],
        maxAcontent = config['maxAcontent']
@@ -308,16 +307,16 @@ rule valid_read_cnt:
 rule create_STAR_index:
     output:
-        config['STAR_idx'] + "/SAindex"
+        config['STAR_idx'] + str(config['read_length']) + "/SAindex"
     params:
        read_len = lambda wildcards: int(config['read_length']) - 7 - 1,
        genome = config['genome'],
        gtf = config['gene_annotation'],
-        idx_dir = config['STAR_idx'],
-        cluster_log = config["log_dir"] + "/cluster_logs/creat_STAR_index.log"
+        idx_dir = config['STAR_idx'] + str(config['read_length']),
+        cluster_log = cluster_logs
     threads: config["threads.STAR"]
     resources:
-        mem = lambda wildcards: int(np.ceil( config['STAR.max.RAM'] / config['threads.STAR'] ) )
+        mem = config['STAR.max.RAM']
     log:
        config["log_dir"] + "/create_STAR_index.log"
     shell:
@@ -342,16 +341,16 @@ rule mapping:
     input:
        config["results_dir"] + "/{sample}.valid.trimmed.UMI.fa.gz",
-        config['STAR_idx'] + "/SAindex"
+        config['STAR_idx'] + str(config['read_length']) + "/SAindex"
     output:
        config["results_dir"] + "/{sample}.STAR_out/Aligned.sortedByCoord.out.bam"
     params:
        anno = config['gene_annotation'],
        outdir = config["results_dir"] + "/{sample}.STAR_out/",
-        idx_dir = config['STAR_idx'],
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        idx_dir = config['STAR_idx'] + str(config['read_length']),
+        cluster_log = cluster_logs
     resources:
-        mem = lambda wildcards: int(np.ceil( config['STAR.max.RAM'] / config['threads.STAR'] ) )
+        mem = config['STAR.max.RAM']
     threads: config['threads.STAR']
     log:
        config["log_dir"] + "/mapping.{sample}.log"
@@ -387,9 +386,9 @@ rule bam2bed:
     input:
        config["results_dir"] + "/{sample}.STAR_out/Aligned.sortedByCoord.out.bam"
     output:
-        temp( config["results_dir"] + "/{sample}.reads.bed.gz" )
+        config["results_dir"] + "/{sample}.reads.bed.gz"
     params:
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
     resources:
        mem=5
     threads: config['threads.bam2bed']
@@ -412,9 +411,9 @@ rule select_by_edit_distance:
     input:
        config["results_dir"] + "/{sample}.reads.bed.gz"
     output:
-        temp( config["results_dir"] + "/{sample}.reads.edit_distance_filtered.bed.gz" )
+        config["results_dir"] + "/{sample}.reads.edit_distance_filtered.bed.gz"
     params:
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
     resources:
        mem = 10
     log:
@@ -434,10 +433,10 @@ rule collapse_mapped_UMI_duplicates:
     input:
        config["results_dir"] + "/{sample}.reads.edit_distance_filtered.bed.gz"
     output:
-        temp(config["results_dir"] + "/{sample}.reads.collapsed.bed.gz"),
+        config["results_dir"] + "/{sample}.reads.collapsed.bed.gz",
        temp(config["results_dir"] + "/counts/{sample}.mapped.nr.out")
     params:
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
     log:
        config["log_dir"] + "/collapse_mapped_UMI_duplicates.{sample}.log"
     shell:
@@ -481,11 +480,11 @@ rule get_3p_ends:
     input:
        config["results_dir"] + "/{sample}.reads.collapsed.bed.gz"
     output:
-        temp( config["results_dir"] + "/{sample}.3pSites.bed.gz")
+        config["results_dir"] + "/{sample}.3pSites.bed.gz"
     params:
        exclude_chr=lambda wildcards: "--exclude=" + ":".join(config['excluded_chr']) if config['excluded_chr'] is not None else "",
        min_align = config['min_3p_align'],
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
     resources:
        mem = 10
     log:
@@ -536,7 +535,7 @@ rule fetch_flanking_seqs:
        upstream_ext = config['upstream_region.IP'],
        downstream_ext = config['downstream_region.IP'],
        genome = config["genome"],
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
     resources:
        mem = 10
     log:
@@ -566,7 +565,7 @@ rule assign_IP_sites:
        tot_As = config['total_As'],
        consec_As = config['consecutive_As'],
        ds_patterns = lambda wildcards: " ".join(["--ds_pattern=%s" % pat for pat in config['downstream_patterns'] ]) if config['downstream_patterns'] is not None else "",
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
     log:
        config["log_dir"] + "/assign_IP_sites.{sample}.log"
     shell:
@@ -641,7 +640,7 @@ rule sites_feature_overlap_stats:
     params:
        gtf = config['gene_annotation'],
        utr_name = config['utr_name'],
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
     log:
        config["log_dir"] + "/sites_feature_overlap_stats.{sample}.log"
     shell:
@@ -665,7 +664,7 @@ rule pool_samples:
     output:
        config["results_dir"] + "/3pSites.tsv.gz"
     params:
-        cluster_log = config["log_dir"] + "/cluster_logs/clustering_process.log"
+        cluster_log = cluster_logs
     resources:
        mem = lambda wildcards: int( 1.5 * len(config['samples']) )
     log:
@@ -734,7 +733,9 @@ rule assign_polyA_signals:
     params:
        signals = lambda wildcards: " ".join(["--motif=%s" % sig for sig in config['polyA.signals'] ]),
        genome = config['genome'],
-        cluster_log = config["log_dir"] + "/cluster_logs/clustering_process.log"
+        cluster_log = cluster_logs
+    resources:
+        mem = 20
     log:
        config["log_dir"] + "/assign_polyA_signals.log"
     shell:
@@ -758,12 +759,12 @@ rule sample_specific_bg:
     input:
        config["results_dir"] + "/3pSites.PAS.tsv.gz"
     output:
-        temp( config["results_dir"] + "/filteredSites/{sample}.filtered.tsv" )
+        config["results_dir"] + "/filteredSites/{sample}.filtered.tsv"
     params:
        cutoff = config['polyAsignal_cutoff'],
        upstream_reg = config['upstream_bg_clusters'],
        downstream_reg = config['downstream_bg_clusters'],
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
     resources:
        mem = 10
     log:
@@ -790,7 +791,7 @@ rule create_noBG_3pSites_table:
     output:
        table_adjusted = temp(config["results_dir"] + "/3pSites.PAS.filtered.tsv.gz")
     params:
-        cluster_log = config["log_dir"] + "/cluster_logs/create_noBG_3pSites_table.log"
+        cluster_log = cluster_logs
     resources:
        mem = 10
     log:
@@ -807,7 +808,7 @@ rule delete_zero_rows:
     input:
        config["results_dir"] + "/3pSites.PAS.filtered.tsv.gz"
     output:
-        temp( config["results_dir"] + "/3pSites.PAS.noBG.tsv.gz" )
+        config["results_dir"] + "/3pSites.PAS.noBG.tsv.gz"
     log:
        config["log_dir"] + "/delete_zero_rows.log"
     run:
@@ -833,11 +834,11 @@ rule cluster_sites:
     input:
        config["results_dir"] + "/3pSites.PAS.noBG.tsv.gz"
     output:
-        temp( config["results_dir"] + "/clusters.primary.tsv.gz" )
+        config["results_dir"] + "/clusters.primary.tsv.gz"
     params:
        upstream_ext = config['upstream_clusters'],
        downstream_ext = config['downstream_clusters'],
-        cluster_log = config["log_dir"] + "/cluster_logs/clustering_process.log"
+        cluster_log = cluster_logs
     log:
        config["log_dir"] + "/cluster_sites.log"
     shell:
@@ -868,7 +869,7 @@ rule merge_clusters:
     params:
        maxsize = config['max_cluster_size'],
        minDistToPAS = config['min_dist_to_PAS'],
-        cluster_log = config["log_dir"] + "/cluster_logs/clustering_process.log"
+        cluster_log = cluster_logs
     log:
        config["log_dir"] + "/merge_clusters.log"
     shell:
@@ -890,7 +891,7 @@ rule clusters_to_bed:
     output:
        config["results_dir"] + "/clusters.merged.bed"
     params:
-        cluster_log = config["log_dir"] + "/cluster_logs/clustering_process.log"
+        cluster_log = cluster_logs
     log:
        config["log_dir"] + "/clusters_to_bed.log"
     shell:
@@ -941,7 +942,7 @@ rule clusters_feature_overlap_stats:
     params:
        gtf = config['gene_annotation'],
        utr_name = config['utr_name'],
-        cluster_log = config["log_dir"] + "/cluster_logs/clustering_process.log"
+        cluster_log = cluster_logs
     log:
        config["log_dir"] + "/clusters_feature_overlap_stats.log"
     shell:
diff --git a/cluster_config.json b/cluster_config.json
index 46a5971a2672130b730c04d15b2e41e448b378d8..845a416548a6dfaf2b3ec173e30be7f0419bbfff 100644
--- a/cluster_config.json
+++ b/cluster_config.json
@@ -1,9 +1,12 @@
 {
     "__default__":
     {
-        "threads":"1",
-        "mem":"2G",
-        "time":"01:00:00"
+        "queue":"6hours",
+        "threads":"1",
+        "mem":"4G",
+        "time":"01:00:00",
+        "name":"{rule}.{wildcards}",
+        "out":"{params.cluster_log}/{rule}.{wildcards}-%j-%N.out"
     },
     "collapse_full_UMI_duplicates":
     {
@@ -12,12 +15,14 @@
     "create_STAR_index":
     {
         "threads": "{threads}",
-        "mem": "{resources.mem}G"
+        "mem": "{resources.mem}G",
+        "time": "06:00:00"
     },
     "mapping":
     {
         "threads": "{threads}",
-        "mem": "{resources.mem}G"
+        "mem": "{resources.mem}G",
+        "time": "06:00:00"
     },
     "select_by_edit_distance":
     {
@@ -36,6 +41,15 @@
     {
         "mem": "{resources.mem}G"
     },
+    "pool_samples":
+    {
+        "mem": "{resources.mem}G"
+    },
+    "assign_polyA_signals":
+    {
+        "time":"06:00:00",
+        "mem": "{resources.mem}G"
+    },
     "sample_specific_bg":
     {
         "mem": "{resources.mem}G"
diff --git a/config.yaml b/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..51d6f1f30753b4a31001e97aa3603610d4444c53
--- /dev/null
+++ b/config.yaml
@@ -0,0 +1,230 @@
+---
+################################################################################
+# directories
+################################################################################
+
+# directory where the input fastq file(s) are located
+input_dir: input
+
+# directory in which the pipeline writes all output files (relative to the current directory)
+# (this also includes the summary files)
+results_dir: output
+
+# directory to which the log files created during processing are written
+log_dir: logs
+
+
+################################################################################
+# sample(s) information
+################################################################################
+
+# name(s) of the input fastq file(s)
+# if the sample file is named sample1.fq.gz,
+# enter here: sample1
+# this name is used throughout the entire pipeline as the name for the output samples
+samples:
+  -
+
+
+# read length (i.e. number of sequencing cycles)
+# how long are the reads of this study?
+read_length: 63
+
+
+################################################################################
+# genome information
+################################################################################
+
+# after cloning, the "resources" directory only contains an Ensembl gtf file
+# of the human genome
+gene_annotation: resources/Homo_sapiens.GRCh38.87.gtf
+
+# Have a look in the README file if you want or need to use a different gtf
+
+# indicate the name with which 3' UTRs are labeled in the gtf file
+# note: in case 5' and 3' UTRs are labeled with one name:
+#       summaries with the fraction of sites that overlap with UTR features
+#       might include overlaps with 5' UTRs (however, this fraction is
+#       expected to be small)
+utr_name: three_prime_utr
+
+# genomic sequences (again, consult the README to find the command that
+# allows you to download the genome in fasta format from Ensembl)
+# after downloading, enter the path and the file name here
+genome: resources/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa
+
+
+################################################################################
+# read_preprocessing
+################################################################################
+
+#-------------------------------------------------------------------------------
+# adapter trimming
+#-------------------------------------------------------------------------------
+
+# the 5' configuration with which reads are expected to start
+# note: "." characters represent any random nucleotide
+# note: the last three nucleotides are required to be three Ts! The number
+#       of preceding random nucleotides is adjustable according to the used
+#       setup
+read_5p_start: ....TTT
+
+# the 3' adapter that is trimmed from the 3' end of reads
+3p_adapter: TGGAATTCTCGGGTGCCAAGG
+
+# the minimum length each read must have after adapter trimming (5' and 3')
+# (shorter reads are discarded)
+min_length: 15
+
+#-------------------------------------------------------------------------------
+# valid read parameters
+#-------------------------------------------------------------------------------
+
+# reads are considered valid based on the following measures
+
+# maximum number of Ns per read/sequence
+maxN: 2
+
+# maximum fraction of As per read/sequence
+maxAcontent: 0.8
+
+# last nucleotide must not be an A (no parameter value necessary here)
+
+
+################################################################################
+# mapping to 3' end site inference
+################################################################################
+
+#-------------------------------------------------------------------------------
+# STAR mapping
+#-------------------------------------------------------------------------------
+
+# the directory in which the STAR genome index will be stored
+# (the pipeline appends the read length to this path, so each read length
+# gets its own index)
+# the first run of the pipeline will generate the index automatically
+# and will store it at the provided directory
+STAR_idx: resources/Homo_sapiens.GRCh38.index
+
+# STAR requires some memory to work efficiently
+# if you plan to run the pipeline on a cluster, STAR.max.RAM sets the
+# number of GB of RAM with which the indexing and mapping jobs are submitted
+STAR.max.RAM: 50
+
+# STAR is much faster when it is run in parallel mode
+# set here the number of available cores STAR is allowed to run on
+threads.STAR: 12
+
+#-------------------------------------------------------------------------------
+# convert BAM to BED
+#-------------------------------------------------------------------------------
+
+# number of threads used for this task
+threads.bam2bed: 8
+
+#-------------------------------------------------------------------------------
+# get initial raw 3' end processing sites
+#-------------------------------------------------------------------------------
+
+# The last x nucleotides at the 3' end of each read should map perfectly to
+# the genomic sequence. This ensures that true 3' end processing sites
+# are inferred.
+# New: allow 1 mismatch/insertion/deletion in the last x nucleotides
+# Set x here:
+min_3p_align: 4
+
+# This parameter allows excluding mappings to certain chromosomes, for example
+# the Y chromosome or the mitochondrial chromosome, if desired
+# Note: - chromosome names must match the naming scheme of the used gtf file
+#       - currently, no chromosome is excluded because Y and M are commented out
+excluded_chr:
+#  - M
+#  - Y
+
+#-------------------------------------------------------------------------------
+# assign 3' end sites that originate from internal priming
+#-------------------------------------------------------------------------------
+
+# define the genomic region around 3' end processing sites that is used
+# to assess internal priming
+# default: 10 nt downstream are considered to define internal priming events
+upstream_region.IP: 0
+downstream_region.IP: 10
+
+# given the downstream region, set the number of total As and
+# consecutive As that must occur for the site to be marked
+# as an internal priming event
+total_As: 7
+consecutive_As: 6
+
+# define patterns that are indicative of internal priming when they occur
+# directly downstream of the cleavage site
+downstream_patterns:
+
+
+################################################################################
+# combine closely spaced 3' end sites into 3' end clusters
+################################################################################
+
+#-------------------------------------------------------------------------------
+# assign poly(A) signals
+#-------------------------------------------------------------------------------
+
+# the list of poly(A) signals that are annotated for single 3' end processing
+# sites in the region -60 to +10 nt around the cleavage sites
+polyA.signals:
+  - AATAAA
+  - ATTAAA
+  - TATAAA
+  - AGTAAA
+  - AATACA
+  - CATAAA
+  - AATATA
+  - GATAAA
+  - AATGAA
+  - AATAAT
+  - AAGAAA
+  - ACTAAA
+  - AATAGA
+  - ATTACA
+  - AACAAA
+  - ATTATA
+  - AACAAG
+  - AATAAG
+
+#-------------------------------------------------------------------------------
+# define a sample-specific background (BG) of site expression
+# for this purpose, run a rough clustering of sites and
+# filter out 3' end sites expressed no higher than the sample-specific BG
+#-------------------------------------------------------------------------------
+
+# cutoff: the percentage of clusters required to have a poly(A) signal
+polyAsignal_cutoff: 90
+# regions, up- and downstream, around highly expressed 3' end sites used to
+# determine lower expressed sites in order to cluster them to the higher
+# expressed one
+upstream_bg_clusters: 25
+downstream_bg_clusters: 25
+
+
+#-------------------------------------------------------------------------------
+# cluster together closely spaced 3' end sites
+#-------------------------------------------------------------------------------
+
+# up- and downstream regions to search for sites that get clustered to
+# the current site (similar to above)
+upstream_clusters: 12
+downstream_clusters: 12
+
+#-------------------------------------------------------------------------------
+# merge closely spaced clusters
+#-------------------------------------------------------------------------------
+
+# define a maximum cluster size
+# merged clusters must not be larger than this size (unless they share the
+# same poly(A) signal)
+max_cluster_size: 25
+
+# rescue clusters that are potential internal priming events when the most
+# downstream 3' end site of the cluster has a minimum distance to the next
+# upstream poly(A) signal -> set this minimum distance here
+min_dist_to_PAS: 15
diff --git a/jobscript.sh b/jobscript.sh
index bb186e89e3578da83a8bf6667a1b7ead5eabd1ac..8fd55f95b62aa1d53003481100ce2291b9b16b2f 100644
--- a/jobscript.sh
+++ b/jobscript.sh
@@ -1,3 +1,12 @@
 #!/bin/sh
 # properties = {properties}
+PATH=[PATH TO CONDA ENV YOU ARE USING AND THAT NEEDS TO BE USED ON SLURM../bin]:${{PATH}}
+export PATH
+
+echo -e "JOB ID\t$JOBID"
+echo "=============================="
+echo -e "rule\t$NAME"
+echo -e "==============================\n"
+
+{exec_job}
diff --git a/run_slurm.sh b/run_slurm.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9eea2e00c0accb9a01944ca12df942b626779383
--- /dev/null
+++ b/run_slurm.sh
@@ -0,0 +1,20 @@
+# set -e
+
+snakemake \
+-p \
+--configfile Aseq_config.yaml \
+--cluster-config cluster_config.json \
+--jobscript jobscript.withJOBID.withPATH.sh \
+--cores=256 \
+--local-cores 10 \
+--cluster "sbatch \
+    --cpus-per-task {cluster.threads} \
+    --mem {cluster.mem} \
+    --qos {cluster.queue} \
+    --time {cluster.time} \
+    --job-name={cluster.name} \
+    -o {cluster.out} \
+    -p PARTITION" \
+--latency-wait 60 \
+--reason \
+&>> Aseq.log
diff --git a/scripts/ag-generate-clusters.pl b/scripts/ag-generate-clusters.pl
index d1885caa9cd699e2da5591c320a0f42e005f18c9..990088806373f17786ccbbfd4baee9b2cec07b35 100644
--- a/scripts/ag-generate-clusters.pl
+++ b/scripts/ag-generate-clusters.pl
@@ -12,11 +12,12 @@ my $TPM_clusters_gt_1nt = 0;
 my $total_TPM = 0;
 my %data = ();
 my @LIBSIZE = ();
+my @SAMPLE_NAME = ();
 my $N = 0;
 
 my $upstream;
 my $downstream;
-my $nr_of_samples;
+my $last_sample_col;
 
 GetOptions(
     "upstream=i" => \$upstream,
@@ -36,18 +37,19 @@ open( F, "gunzip -c $ARGV[0] |" );
 while (<F>) {
     chomp;
     if ( $_ =~ m/^#(.+)/ ) {
+        # example header line: #3;output_dir/SRR3543897.3pSites.ip.bed.gz;2409790.05912655
         my @F = split( /;/, $1 );
+        my @filepath = split( /\//, $F[1] );
+        my @filename = split( /\./, $filepath[-1] );
+        $SAMPLE_NAME[ $F[0] ] = $filename[0];
         $LIBSIZE[ $F[0] ] = $F[2];
         next;
     }
+
     my @F = split(/[\t\s]+/);
     $N++;
-    if( not defined $nr_of_samples ) {
-        my @tmp_ar = @F[3 .. ($#F-2)];
-        $nr_of_samples = scalar @tmp_ar;
-    }
-
+    $last_sample_col = $#F - 2; # for looping over sample names
     my $OVERALL_TPM = 0;
     my $n_supported = 0;
     my @tmp = ();
@@ -76,8 +78,8 @@ my $ip_candidates = 0;
 
 # print header line
 print "#chr\tstrand\tstart\tend\trep\ttotal_tpm\trepSite_signals";
-foreach my $i (0 .. ($nr_of_samples - 1)) {
-    print "\tsample_",$i,"_tpm";
+foreach my $idx ( 3 .. $last_sample_col ) {
+    print "\t",$SAMPLE_NAME[ $idx ],"_tpm";
 }
 print "\n";
diff --git a/scripts/rs-get-3pEnds-from-bed.pl b/scripts/rs-get-3pEnds-from-bed.pl
index 9b70563ec494ef06f5c33dede804659c8c7e97b5..975d9a67ac51c69302a706b311092746b8bbfb3b 100644
--- a/scripts/rs-get-3pEnds-from-bed.pl
+++ b/scripts/rs-get-3pEnds-from-bed.pl
@@ -2,6 +2,8 @@ use strict;
 use warnings;
 use Getopt::Long;
 
+# Modified 30.04.2021 by Mihaela Zavolan to allow one mismatch in the last x (min_align) nucleotides of the mapped read.
+
 my $help;
 my $debug;
 my $strict;
@@ -28,10 +30,6 @@ if ($showhelp) {
     exit(2);
 }
 
-if ( $ARGV[0] !~ /gz$/ ) {
-    print STDERR "[ERROR] The input $ARGV[0] is required to be gzipped\n\n";
-    exit(2);
-}
 
 my %exclude_chr = ();
 if ( defined $exclude) {
@@ -55,7 +53,14 @@ foreach my $strand ( '+', '-' ) {
 
 my %hash = ();
 my %hashdebug = ();
-open( F, "gzip -dc $ARGV[0] |" ) or die "Can't open pipe to $ARGV[0]\n";
+
+if ( $ARGV[0] !~ /gz$/ ) {
+    open(F, $ARGV[0]) or die "Can't open $ARGV[0]\n";
+} else {
+    open( F, "gzip -dc $ARGV[0] |" ) or die "Can't open pipe to $ARGV[0]\n";
+}
+
+
 while (<F>) {
     chomp;
     my @F = split(/\t/);
@@ -64,32 +69,59 @@ while (<F>) {
     next if ( defined $exclude_chr{ $F[0] } );
     if ($strict) {
         my $alignflag = 0;
+
+        # maximum length of the pattern encoding a single-character difference
+        # format: 15MATIGDCDT12 (15 matches -> 1 mismatch (A in reference, T in read) -> 1 insertion (G in read) -> 1 deletion (C in reference) -> 1 deletion (T in reference) -> 12 matches)
+        my $len_scd = 3;
+
+        # plus strand alignment
         if ( $F[5] eq '+' ) {
-            if ( $F[6] =~ m/^\d+$/ ) {
-                # perfect match: okay
-                $alignflag = 1;
-            } elsif ( $F[6] !~ m/\d+$/ ) {
-                # end badly aligned: skip
-            } else {
-                if ( $F[6] =~ m/(.*\D)(\d+)$/ ) {
-                    if ( $2 >= $min_align ) {
-                        $alignflag = 1;
-                    }
+            if ( $F[6] =~ m/\D*(\d+)$/ ) {
+                my ($m) = $1;
+                # number of matches at the end high enough?
+                if($m >= $min_align) {
+                    $alignflag = 1;
+                # if not, there's another chance if only 1 indel/mismatch follows
+                } elsif ( $F[6] =~ m/\D*(\d+)(\D+)(\d+)$/ ) {
+                    my ($m1, $mm, $m2) = ($1, $2, $3);
+                    # single indel/mismatch within the last min_align characters of the alignment
+                    if(length($mm) <= $len_scd) {
+                        if($m1 + $m2 >= $min_align-1) {
+                            $alignflag = 1;
+                        }
+                    }
+                }
+            } elsif ( $F[6] =~ m/\D*(\d+)(\D+)$/ ) {
+                my ($m, $mm) = ($1, $2);
+                # single indel/mismatch at the end of the alignment
+                if(length($mm) <= $len_scd && $m >= $min_align-1) {
+                    $alignflag = 1;
                 }
             }
+        # minus strand alignment
         } else {
-            if ( $F[6] =~ m/^\d+$/ ) {
-                # perfect match: okay
-                $alignflag = 1;
-            } elsif ( $F[6] !~ m/^\d+/ ) {
-                # end badly aligned: skip
-            } else {
-                if ( $F[6] =~ m/^(\d+)(\D.*)/ ) {
-                    if ( $1 >= $min_align ) {
+            if ( $F[6] =~ m/^(\d+)\D*/ ) {
+                my ($m) = $1;
+                # number of matches at the end high enough?
+                if($m >= $min_align) {
+                    $alignflag = 1;
+                # if not, there's another chance if only 1 indel/mismatch follows
+                } elsif ( $F[6] =~ m/^(\d+)(\D+)(\d+)\D*/ ) {
+                    my ($m1, $mm, $m2) = ($3, $2, $1);
+                    # single indel/mismatch within the last min_align characters of the alignment
+                    if(length($mm) <= $len_scd) {
+                        if($m1 + $m2 >= $min_align-1) {
                             $alignflag = 1;
+                        }
                     }
                 }
-            }
+            } elsif ( $F[6] =~ m/^(\D+)(\d+)\D*/ ) {
+                my ($m, $mm) = ($2, $1);
+                # single indel/mismatch at the end of the alignment
+                if(length($mm) <= $len_scd && $m >= $min_align-1) {
+                    $alignflag = 1;
+                }
+            }
         }
         next if ( $alignflag == 0 );
     }
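For reference, the filter that the modified rs-get-3pEnds-from-bed.pl applies to each read can be stated compactly: the 3'-terminal min_align nucleotides (min_3p_align: 4 in config.yaml) must match the genome, except that a single mismatch or indel — an annotation pattern of at most $len_scd = 3 characters in column 7 of the BED input — is tolerated if the surrounding match runs still cover at least min_align - 1 nucleotides. The following is a minimal Python sketch of the plus-strand branch only; the function name, default values, and example strings are illustrative assumptions, not part of the patch.

import re

def tail_aligns_well(mm_string, min_align=4, max_edit_len=3):
    """Hypothetical helper mirroring the plus-strand check in the patch:
    does the read's 3' end align (almost) perfectly to the genome?"""
    # Case 1: the annotation ends in a run of >= min_align matches.
    m = re.search(r'(\d+)$', mm_string)
    if m and int(m.group(1)) >= min_align:
        return True
    # Case 2: one short edit (<= max_edit_len annotation characters, i.e. a
    # single mismatch such as "MAT" or indel such as "IG"/"DC") inside the
    # terminal window, flanked by match runs covering >= min_align - 1 nt.
    m = re.search(r'(\d+)(\D+)(\d+)$', mm_string)
    if m and len(m.group(2)) <= max_edit_len \
            and int(m.group(1)) + int(m.group(3)) >= min_align - 1:
        return True
    # Case 3: the single edit is the last annotated event of the alignment.
    m = re.search(r'(\d+)(\D+)$', mm_string)
    if m and len(m.group(2)) <= max_edit_len \
            and int(m.group(1)) >= min_align - 1:
        return True
    return False

# e.g. tail_aligns_well("15MAT12")  -> True  (12 terminal matches)
#      tail_aligns_well("20MAT1")   -> True  (one mismatch, 20 + 1 flanking matches)
#      tail_aligns_well("18IAIC1")  -> False (two insertions in the terminal window)

On the minus strand the script applies the mirrored test to the beginning of the annotation, since there the read's 3' end corresponds to the lowest genomic coordinate.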