Commit 35d0f5c7 authored by CJHerrmann

adapted for slurm, allow 1 mismatch in last 4nt

parent 90c6ef59

Snakefile:

configfile: "Aseq_config.yaml"
from snakemake.utils import makedirs
from snakemake.utils import listfiles
@@ -21,14 +19,14 @@ rule finish:
snakemake was run locally.
'''
input: config["results_dir"] + "/counts/annotation_overlap/clusters.summary.tsv"
-    params:
-        logdir = config["log_dir"]
-    run:
+    #params:
+    #    logdir = config["log_dir"]
+    #run:
# check if cluster_logs dir for first sample is empty
# if yes, delete the cluster_logs dir
-        file_list = os.listdir("{log}/cluster_logs/".format(log=params.logdir) )
-        if len(file_list) == 0:
-            shell("rm -r {params.logdir}/cluster_logs")
+        #file_list = os.listdir("{log}/cluster_logs/".format(log=params.logdir) )
+        #if len(file_list) == 0:
+        #    shell("rm -r {params.logdir}/cluster_logs")
################################################################################
# chronological processing of the reads
@@ -37,6 +35,7 @@ rule finish:
#-------------------------------------------------------------------------------
# create dir for logfiles
#-------------------------------------------------------------------------------
+cluster_logs = os.path.join( config["log_dir"], "cluster_logs")
rule create_log_dir:
## LOCAL ##
@@ -47,7 +46,7 @@ rule create_log_dir:
output:
temp( config["results_dir"] + "/created_log.tmp" )
run:
-        makedirs( [config["log_dir"], os.path.join( config["log_dir"], "cluster_logs") ] )
+        makedirs( [config["log_dir"], cluster_logs] )
shell("touch {output}")
#-------------------------------------------------------------------------------
@@ -75,7 +74,7 @@ rule fastq2fasta:
config["results_dir"] + "/created_log.tmp"
output: temp(config["results_dir"] + "/{sample}.fa.gz")
params:
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
log:
config["log_dir"] + "/fastq2fasta.{sample}.log"
shell:
@@ -116,7 +115,7 @@ rule select_for_valid_5p_configuration:
output: temp(config["results_dir"] + "/{sample}.5ptrimmed.UMI_available.fa.gz")
params:
adapt=config['read_5p_start'],
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
log:
config["log_dir"] + "/filter_5p_read_start.{sample}.log"
shell:
@@ -160,9 +159,9 @@ rule collapse_full_UMI_duplicates:
input: config["results_dir"] + "/{sample}.5ptrimmed.UMI_available.fa.gz"
output: temp( config["results_dir"] + "/{sample}.UMI_dupl_removed.fa.gz" )
params:
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
resources:
-        mem = 10
+        mem = 20
log:
config["log_dir"] + "/collapse_full_UMI_duplicates.{sample}.log"
shell:
@@ -208,7 +207,7 @@ rule trim_3p_adapter:
params:
adapt=config['3p_adapter'],
minLen=config['min_length'],
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
log:
config["log_dir"] + "/trim_3p_adapter.{sample}.log"
shell:
@@ -261,7 +260,7 @@ rule get_valid_reads:
input:
config["results_dir"] + "/{sample}.trimmed.UMI.fa.gz"
output:
-        temp( config["results_dir"] + "/{sample}.valid.trimmed.UMI.fa.gz" )
+        config["results_dir"] + "/{sample}.valid.trimmed.UMI.fa.gz"
params:
maxN = config['maxN'],
maxAcontent = config['maxAcontent']
@@ -308,16 +307,16 @@ rule valid_read_cnt:
rule create_STAR_index:
output:
-        config['STAR_idx'] + "/SAindex"
+        config['STAR_idx'] + str(config['read_length']) + "/SAindex"
params:
read_len = lambda wildcards: int(config['read_length']) - 7 - 1,
genome = config['genome'],
gtf = config['gene_annotation'],
-        idx_dir = config['STAR_idx'],
-        cluster_log = config["log_dir"] + "/cluster_logs/creat_STAR_index.log"
+        idx_dir = config['STAR_idx'] + str(config['read_length']),
+        cluster_log = cluster_logs
threads: config["threads.STAR"]
resources:
-        mem = lambda wildcards: int(np.ceil( config['STAR.max.RAM'] / config['threads.STAR'] ) )
+        mem = config['STAR.max.RAM']
log:
config["log_dir"] + "/create_STAR_index.log"
shell:
@@ -342,16 +341,16 @@ rule create_STAR_index:
rule mapping:
input:
config["results_dir"] + "/{sample}.valid.trimmed.UMI.fa.gz",
-        config['STAR_idx'] + "/SAindex"
+        config['STAR_idx'] + str(config['read_length']) + "/SAindex"
output:
config["results_dir"] + "/{sample}.STAR_out/Aligned.sortedByCoord.out.bam"
params:
anno = config['gene_annotation'],
outdir = config["results_dir"] + "/{sample}.STAR_out/",
-        idx_dir = config['STAR_idx'],
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        idx_dir = config['STAR_idx'] + str(config['read_length']),
+        cluster_log = cluster_logs
resources:
-        mem = lambda wildcards: int(np.ceil( config['STAR.max.RAM'] / config['threads.STAR'] ) )
+        mem = config['STAR.max.RAM']
threads: config['threads.STAR']
log:
config["log_dir"] + "/mapping.{sample}.log"
@@ -387,9 +386,9 @@ rule bam2bed:
input:
config["results_dir"] + "/{sample}.STAR_out/Aligned.sortedByCoord.out.bam"
output:
-        temp( config["results_dir"] + "/{sample}.reads.bed.gz" )
+        config["results_dir"] + "/{sample}.reads.bed.gz"
params:
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
resources:
mem=5
threads: config['threads.bam2bed']
@@ -412,9 +411,9 @@ rule select_by_edit_distance:
input:
config["results_dir"] + "/{sample}.reads.bed.gz"
output:
-        temp( config["results_dir"] + "/{sample}.reads.edit_distance_filtered.bed.gz" )
+        config["results_dir"] + "/{sample}.reads.edit_distance_filtered.bed.gz"
params:
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
resources:
mem = 10
log:
@@ -434,10 +433,10 @@ rule collapse_mapped_UMI_duplicates:
input:
config["results_dir"] + "/{sample}.reads.edit_distance_filtered.bed.gz"
output:
-        temp(config["results_dir"] + "/{sample}.reads.collapsed.bed.gz"),
+        config["results_dir"] + "/{sample}.reads.collapsed.bed.gz",
temp(config["results_dir"] + "/counts/{sample}.mapped.nr.out")
params:
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
log:
config["log_dir"] + "/collapse_mapped_UMI_duplicates.{sample}.log"
shell:
@@ -481,11 +480,11 @@ rule get_3p_ends:
input:
config["results_dir"] + "/{sample}.reads.collapsed.bed.gz"
output:
-        temp( config["results_dir"] + "/{sample}.3pSites.bed.gz")
+        config["results_dir"] + "/{sample}.3pSites.bed.gz"
params:
exclude_chr=lambda wildcards: "--exclude=" + ":".join(config['excluded_chr']) if config['excluded_chr'] is not None else "",
min_align = config['min_3p_align'],
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
resources:
mem = 10
log:
@@ -536,7 +535,7 @@ rule fetch_flanking_seqs:
upstream_ext = config['upstream_region.IP'],
downstream_ext = config['downstream_region.IP'],
genome = config["genome"],
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
resources:
mem = 10
log:
@@ -566,7 +565,7 @@ rule assign_IP_sites:
tot_As = config['total_As'],
consec_As = config['consecutive_As'],
ds_patterns = lambda wildcards: " ".join(["--ds_pattern=%s" % pat for pat in config['downstream_patterns'] ]) if config['downstream_patterns'] is not None else "",
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
log:
config["log_dir"] + "/assign_IP_sites.{sample}.log"
shell:
@@ -641,7 +640,7 @@ rule sites_feature_overlap_stats:
params:
gtf = config['gene_annotation'],
utr_name = config['utr_name'],
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
log:
config["log_dir"] + "/sites_feature_overlap_stats.{sample}.log"
shell:
@@ -665,7 +664,7 @@ rule pool_samples:
output:
config["results_dir"] + "/3pSites.tsv.gz"
params:
-        cluster_log = config["log_dir"] + "/cluster_logs/clustering_process.log"
+        cluster_log = cluster_logs
resources:
mem = lambda wildcards: int( 1.5 * len(config['samples']) )
log:
@@ -734,7 +733,9 @@ rule assign_polyA_signals:
params:
signals = lambda wildcards: " ".join(["--motif=%s" % sig for sig in config['polyA.signals'] ]),
genome = config['genome'],
-        cluster_log = config["log_dir"] + "/cluster_logs/clustering_process.log"
+        cluster_log = cluster_logs
+    resources:
+        mem = 20
log:
config["log_dir"] + "/assign_polyA_signals.log"
shell:
@@ -758,12 +759,12 @@ rule sample_specific_bg:
input:
config["results_dir"] + "/3pSites.PAS.tsv.gz"
output:
-        temp( config["results_dir"] + "/filteredSites/{sample}.filtered.tsv" )
+        config["results_dir"] + "/filteredSites/{sample}.filtered.tsv"
params:
cutoff = config['polyAsignal_cutoff'],
upstream_reg = config['upstream_bg_clusters'],
downstream_reg = config['downstream_bg_clusters'],
-        cluster_log = lambda wildcards: config["log_dir"] + "/cluster_logs/" + wildcards.sample + ".log"
+        cluster_log = cluster_logs
resources:
mem = 10
log:
@@ -790,7 +791,7 @@ rule create_noBG_3pSites_table:
output:
table_adjusted = temp(config["results_dir"] + "/3pSites.PAS.filtered.tsv.gz")
params:
-        cluster_log = config["log_dir"] + "/cluster_logs/create_noBG_3pSites_table.log"
+        cluster_log = cluster_logs
resources:
mem = 10
log:
@@ -807,7 +808,7 @@ rule delete_zero_rows:
input:
config["results_dir"] + "/3pSites.PAS.filtered.tsv.gz"
output:
-        temp( config["results_dir"] + "/3pSites.PAS.noBG.tsv.gz" )
+        config["results_dir"] + "/3pSites.PAS.noBG.tsv.gz"
log:
config["log_dir"] + "/delete_zero_rows.log"
run:
@@ -833,11 +834,11 @@ rule cluster_sites:
input:
config["results_dir"] + "/3pSites.PAS.noBG.tsv.gz"
output:
-        temp( config["results_dir"] + "/clusters.primary.tsv.gz" )
+        config["results_dir"] + "/clusters.primary.tsv.gz"
params:
upstream_ext = config['upstream_clusters'],
downstream_ext = config['downstream_clusters'],
-        cluster_log = config["log_dir"] + "/cluster_logs/clustering_process.log"
+        cluster_log = cluster_logs
log:
config["log_dir"] + "/cluster_sites.log"
shell:
@@ -868,7 +869,7 @@ rule merge_clusters:
params:
maxsize = config['max_cluster_size'],
minDistToPAS = config['min_dist_to_PAS'],
-        cluster_log = config["log_dir"] + "/cluster_logs/clustering_process.log"
+        cluster_log = cluster_logs
log:
config["log_dir"] + "/merge_clusters.log"
shell:
@@ -890,7 +891,7 @@ rule clusters_to_bed:
output:
config["results_dir"] + "/clusters.merged.bed"
params:
-        cluster_log = config["log_dir"] + "/cluster_logs/clustering_process.log"
+        cluster_log = cluster_logs
log:
config["log_dir"] + "/clusters_to_bed.log"
shell:
@@ -941,7 +942,7 @@ rule clusters_feature_overlap_stats:
params:
gtf = config['gene_annotation'],
utr_name = config['utr_name'],
-        cluster_log = config["log_dir"] + "/cluster_logs/clustering_process.log"
+        cluster_log = cluster_logs
log:
config["log_dir"] + "/clusters_feature_overlap_stats.log"
shell:
......

cluster_config.json:

{
"__default__":
{
"threads":"1",
"mem":"2G",
"time":"01:00:00"
"queue":"6hours",
"threads":"1",
"mem":"4G",
"time":"01:00:00",
"name":"{rule}.{wildcards}",
"out":"{params.cluster_log}/{rule}.{wildcards}-%j-%N.out"
},
"collapse_full_UMI_duplicates":
{
@@ -12,12 +15,14 @@
"create_STAR_index":
{
"threads": "{threads}",
"mem": "{resources.mem}G"
"mem": "{resources.mem}G",
"time": "06:00:00"
},
"mapping":
{
"threads": "{threads}",
"mem": "{resources.mem}G"
"mem": "{resources.mem}G",
"time": "06:00:00"
},
"select_by_edit_distance":
{
@@ -36,6 +41,15 @@
{
"mem": "{resources.mem}G"
},
"pool_samples":
{
"mem": "{resources.mem}G"
},
"assign_polyA_signals":
{
"time":"06:00:00",
"mem": "{resources.mem}G"
},
"sample_specific_bg":
{
"mem": "{resources.mem}G"
......

Aseq_config.yaml:

---
################################################################################
# directories
################################################################################
# directory where the input fastq file(s) are located
input_dir: input
# directory in which the pipeline writes all output files, including the
# summary files (relative to the current directory)
results_dir: output
# directory to which the log files created during processing are written
log_dir: logs
################################################################################
# sample(s) information
################################################################################
# name(s) of the input fastq file(s)
# if the sample file is named sample1.fq.gz, enter here: sample1
# this name is used throughout the entire pipeline to name the output files
samples:
-
# read length (i.e. the number of sequencing cycles) of the reads in this study
read_length: 63
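# note (editor's sketch): elsewhere in the Snakefile the STAR index is built
# with read_len = read_length - 7 - 1; the 7 presumably accounts for the
# 4 random (UMI) nucleotides plus the TTT of read_5p_start that are removed
# before mapping, and the extra -1 matches STAR's sjdbOverhang convention
# (mapped read length - 1). For read_length: 63 this gives 63 - 7 - 1 = 55.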
################################################################################
# genome information
################################################################################
# after cloning, the "resources" directory only contains an Ensembl gtf file
# of the human genome
gene_annotation: resources/Homo_sapiens.GRCh38.87.gtf
# Have a look in the README file if you want or need to use a different gtf
# indicate the name 3' UTRs are labeled with in the gtf file
# note: in case 5' and 3' UTRs are labeled with the same name,
#       summaries with the fraction of sites that overlap with UTR features
#       might include overlaps with 5' UTRs (however, this fraction is
#       expected to be small)
utr_name: three_prime_utr
# genomic sequences (again, consult the README for the command that
# downloads the genome in fasta format from Ensembl)
# after downloading, enter the path and the file name here
genome: resources/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa
################################################################################
# read_preprocessing
################################################################################
#-------------------------------------------------------------------------------
# adapter trimming
#-------------------------------------------------------------------------------
# the 5' configuration that reads are expected to start with
# note: "." characters represent any random nucleotide (the UMI)
# note: the last three nucleotides are required to be TTT! The number
#       of preceding random nucleotides is adjustable according to the
#       setup used
read_5p_start: ....TTT
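# example (editor's illustration): for a read starting with ACGTTTTGCCAT...,
# "...." captures the UMI ACGT, the following TTT is required and trimmed,
# and GCCAT... is what remains for the downstream steps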
# the 3' adapter that is trimmed from the 3' end of reads
3p_adapter: TGGAATTCTCGGGTGCCAAGG
# the minimum length each read must have after adapter trimming (5' and 3')
# (shorter reads are discarded)
min_length: 15
#-------------------------------------------------------------------------------
# valid read parameters
#-------------------------------------------------------------------------------
# reads are considered valid based on the following criteria
# maximum number of Ns per read/sequence
maxN: 2
# maximum fraction of As per read/sequence
maxAcontent: 0.8
# the last nucleotide must not be an A (no parameter value necessary here)
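# a minimal sketch (editor's illustration, not the pipeline's actual code) of
# how these three criteria could be applied to a read sequence:
#     def is_valid(seq, maxN=2, maxAcontent=0.8):
#         return (seq.count("N") <= maxN
#                 and seq.count("A") / len(seq) <= maxAcontent
#                 and not seq.endswith("A"))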
################################################################################
# from mapping to 3' end site inference
################################################################################
#-------------------------------------------------------------------------------
# STAR mapping
#-------------------------------------------------------------------------------
# the directory in which the STAR genome index will be stored
# (the pipeline appends the read length to this path)
# the first run of the pipeline will generate the index automatically
# and store it in the provided directory
STAR_idx: resources/Homo_sapiens.GRCh38.index
# STAR requires a considerable amount of memory to work efficiently
# if you plan to run the pipeline on a cluster, the indexing and mapping
# jobs are submitted with STAR.max.RAM GB of RAM
STAR.max.RAM: 50
# STAR is much faster when run in parallel
# set the number of cores STAR is allowed to use here
threads.STAR: 12
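# note (editor's sketch): with the cluster configuration shipped alongside,
# indexing and mapping jobs are submitted via sbatch with --mem {resources.mem}G,
# i.e. 50G in total per job (not per CPU), while STAR itself parallelizes
# over threads.STAR cores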
#-------------------------------------------------------------------------------
# convert BAM to BED
#-------------------------------------------------------------------------------
# number of threads used for this task
threads.bam2bed: 8
#-------------------------------------------------------------------------------
# get initial raw 3' end processing sites
#-------------------------------------------------------------------------------
# The last x nucleotides at the 3' end of each read should map perfectly to
# the genomic sequence. This ensures that true 3' end processing sites
# are inferred.
# New: 1 mismatch/insertion/deletion is allowed in the last x nucleotides
# Set x here:
min_3p_align: 4
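# examples (editor's note), for min_3p_align: 4 with the 1-difference rule:
#   - the last 4+ nt match perfectly                       -> site is used
#   - one mismatch/indel flanked by >= 3 matching nt       -> site is used
#   - two or more differences within the last 4 nt         -> read is skipped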
# This parameter allows excluding mappings to certain chromosomes, for example
# the Y chromosome or the mitochondrial chromosome, if desired
# Note: - chromosome names must match the naming scheme of the used gtf file
#       - currently, no chromosome is excluded because M and Y are commented out
excluded_chr:
# - M
# - Y
#-------------------------------------------------------------------------------
# assign 3' end sites that originate from internal priming
#-------------------------------------------------------------------------------
# define the genomic region around 3' end processing sites that is used
# to assess internal priming
# default: the 10 nt downstream are considered to define internal priming events
upstream_region.IP: 0
downstream_region.IP: 10
# given the downstream region, set values for the number of total As and
# consecutive As that must occur for the site to be marked
# as an internal priming event
total_As: 7
consecutive_As: 6
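# example (editor's note): with total_As: 7 and consecutive_As: 6, a site
# whose 10 nt downstream region reads AAAAAAGTAC (7 As in total, 6 of them
# consecutive) would be marked as an internal priming event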
# define patterns that are indicative of internal priming when they occur
# directly downstream of the cleavage site
downstream_patterns:
################################################################################
# combine closely spaced 3' end sites into 3' end clusters
################################################################################
#-------------------------------------------------------------------------------
# assign poly(A) signals
#-------------------------------------------------------------------------------
# the list of poly(A) signals that are annotated for single 3' end processing
# sites when found in the region -60 to +10 nt around the cleavage site
# (a commented sketch of the lookup follows the list below)
polyA.signals:
- AATAAA
- ATTAAA
- TATAAA
- AGTAAA
- AATACA
- CATAAA
- AATATA
- GATAAA
- AATGAA
- AATAAT
- AAGAAA
- ACTAAA
- AATAGA
- ATTACA
- AACAAA
- ATTATA
- AACAAG
- AATAAG
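# a minimal sketch (editor's illustration; the pipeline's own script may
# differ) of annotating a single cleavage site with these motifs:
#     signals = ["AATAAA", "ATTAAA"]  # ... plus the remaining motifs above
#     region = genome[site - 60 : site + 10]   # -60..+10 nt around the site
#     found = [s for s in signals if s in region]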
#-------------------------------------------------------------------------------
# define a sample-specific background (BG) of site expression
# for this purpose, run a rough clustering of sites and
# filter out 3' end sites that are not expressed above the sample-specific BG
#-------------------------------------------------------------------------------
# cutoff: the percentage of clusters that are required to have a poly(A) signal
polyAsignal_cutoff: 90
# up- and downstream regions around highly expressed 3' end sites within
# which lower expressed sites are searched for and clustered to the higher
# expressed one
upstream_bg_clusters: 25
downstream_bg_clusters: 25
#-------------------------------------------------------------------------------
# cluster together closely spaced 3' end sites
#-------------------------------------------------------------------------------
# up- and downstream regions to search for sites that get clustered to
# the current site (similar to above)
upstream_clusters: 12
downstream_clusters: 12
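# a rough sketch (editor's illustration; the repository's clustering script
# may differ) of the greedy idea: visit sites by decreasing expression and
# attach any still-unclustered site within the window to the current one:
#     for site in sorted(sites, key=lambda s: -s.tpm):    # hypothetical objects
#         if site.clustered:
#             continue
#         site.clustered = True                           # site seeds a cluster
#         for other in within(site, up=12, down=12):      # hypothetical helper
#             if not other.clustered:
#                 other.cluster, other.clustered = site, True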
#-------------------------------------------------------------------------------
# merge closely spaced clusters
#-------------------------------------------------------------------------------
# define a maximum cluster size
# merged clusters must not be larger than this size (unless they share the
# same poly(A) signals)
max_cluster_size: 25
# rescue clusters that are potential internal priming events when the most
# downstream 3' end site of the cluster has a minimum distance to the next
# upstream poly(A) signal -> set this minimum distance here
min_dist_to_PAS: 15
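# example (editor's note): a poly(A) signal starting 21 nt upstream of the
# cluster's most downstream site satisfies min_dist_to_PAS: 15 and rescues
# the cluster, whereas one only 8 nt upstream does not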
jobscript.withJOBID.withPATH.sh:

#!/bin/sh
# properties = {properties}
PATH=[PATH TO THE CONDA ENV THAT IS TO BE USED ON SLURM]/bin:${{PATH}}
export PATH
echo -e "JOB ID\t$JOBID"
echo "=============================="
echo -e "rule\t$NAME"
echo -e "==============================\n"
{exec_job}
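
# snakemake invocation that submits each rule's jobs to SLURM via sbatch: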
# set -e
snakemake \
-p \
--configfile Aseq_config.yaml \
--cluster-config cluster_config.json \
--jobscript jobscript.withJOBID.withPATH.sh \
--cores=256 \
--local-cores 10 \
--cluster "sbatch \
--cpus-per-task {cluster.threads} \
--mem {cluster.mem} \
--qos {cluster.queue} \
--time {cluster.time} \
--job-name={cluster.name} \
-o {cluster.out} \
        -p PARTITION" \
--latency-wait 60 \
--reason \
&>> Aseq.log
@@ -12,11 +12,12 @@ my $TPM_clusters_gt_1nt = 0;
my $total_TPM = 0;
my %data = ();
my @LIBSIZE = ();
my @SAMPLE_NAME = ();
my $N = 0;
my $upstream;
my $downstream;
my $nr_of_samples;
my $last_sample_col;
GetOptions(
"upstream=i" => \$upstream,
@@ -36,18 +37,19 @@ open( F, "gunzip -c $ARGV[0] |" );
while (<F>) {
chomp;
if ( $_ =~ m/^#(.+)/ ) {
# #3;output_dir/SRR3543897.3pSites.ip.bed.gz;2409790.05912655
my @F = split( /;/, $1 );
my @filepath = split( /\//, $F[1] );
my @filename = split ( /\./, $filepath[-1] );
$SAMPLE_NAME[ $F[0] ] = $filename[0];
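        # e.g. for the header format shown above (editor's note):
        # $SAMPLE_NAME[3] = "SRR3543897", $LIBSIZE[3] = 2409790.05912655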
$LIBSIZE[ $F[0] ] = $F[2];
next;
}
my @F = split(/[\t\s]+/);
$N++;
if( not defined $nr_of_samples ) {
my @tmp_ar = @F[3 .. ($#F-2)];
$nr_of_samples = scalar @tmp_ar;
}
$last_sample_col = $#F -2; # for looping over sample names;
my $OVERALL_TPM = 0;
my $n_supported = 0;
my @tmp = ();
@@ -76,8 +78,8 @@ my $ip_candidates = 0;
# print header line
print "#chr\tstrand\tstart\tend\trep\ttotal_tpm\trepSite_signals";
foreach my $i (0 .. ($nr_of_samples - 1)) {
print "\tsample_",$i,"_tpm";
foreach my $idx ( 3 .. $last_sample_col ) {
print "\t",$SAMPLE_NAME[ $idx ],"_tpm";
}
print "\n";
......
@@ -2,6 +2,8 @@ use strict;
use warnings;
use Getopt::Long;
# Modified 30.04.2021 by Mihaela Zavolan to allow one mismatch in the last x (min_align) nucleotides of the mapped read.
my $help;
my $debug;
my $strict;
@@ -28,10 +30,6 @@ if ($showhelp) {
exit(2);
}
if ( $ARGV[0] !~ /gz$/ ) {
print STDERR "[ERROR] The input $ARGV[0] is required to be gzipped\n\n";
exit(2);
}
my %exclude_chr = ();
if ( defined $exclude) {
@@ -55,7 +53,14 @@ foreach my $strand ( '+', '-' ) {
my %hash = ();
my %hashdebug = ();
open( F, "gzip -dc $ARGV[0] |" ) or die "Can't open pipe to $ARGV[0]\n";
if ( $ARGV[0] !~ /gz$/ ) {
open(F, $ARGV[0]);
} else {
open( F, "gzip -dc $ARGV[0] |" ) or die "Can't open pipe to $ARGV[0]\n";
}
while (<F>) {
chomp;
my @F = split(/\t/);
@@ -64,32 +69,59 @@ while (<F>) {
next if ( defined $exclude_chr{ $F[0] } );
if ($strict) {
my $alignflag = 0;
        # length of the pattern encoding a single-character difference
        # format: 15MATIGDCDT12 = 15 matches -> 1 mismatch ("MAT": A in
        #         reference, T in read) -> 1 insertion ("IG": G in read) ->
        #         1 deletion ("DC": C in reference) -> 1 deletion ("DT": T in
        #         reference) -> 12 matches
my $len_scd = 3;
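        # worked examples (editor's note; plus strand, min_align = 4):
        #   "30"      perfect match                                   -> kept
        #   "26MAT2"  one mismatch, 26 + 2 flanking matches           -> kept
        #   "30MAT"   one mismatch at the very end, 30 matches before -> kept
        #   "2MATIG1" two differences ("MATIG" exceeds $len_scd)      -> skipped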
#plus strand alignment
if ( $F[5] eq '+' ) {
if ( $F[6] =~ m/^\d+$/ ) {
# perfect match: okay
$alignflag = 1;
} elsif ( $F[6] !~ m/\d+$/ ) {
# end badly aligned: skip
} else {
if ( $F[6] =~ m/(.*\D)(\d+)$/ ) {
if ( $2 >= $min_align ) {
$alignflag = 1;
}
if ( $F[6] =~ m/\D*(\d+)$/ ) {
my ($m) = $1;
# Number of matches at end high enough?
if($m >= $min_align) {
$alignflag = 1;
# if not, there's another chance if only 1 indel/mm follows
} elsif ( $F[6] =~ m/\D*(\d+)(\D+)(\d+)$/ ) {
my ($m1, $mm, $m2) = ($1, $2, $3);
                    # single indel/mismatch within the last min_align characters of the alignment
if(length($mm) <= $len_scd) {
if($m1 + $m2 >= $min_align-1) {
$alignflag = 1;
}
}
}
} elsif ( $F[6] =~ m/\D*(\d+)(\D+)$/ ) {
my ($m, $mm) = ($1, $2);
# single indel/mismatch at the end of the alignment
if(length($mm) <= $len_scd && $m >= $min_align-1) {
$alignflag = 1;
}
}
#minus strand alignment
} else {
if ( $F[6] =~ m/^\d+$/ ) {
# perfect match: okay
$alignflag = 1;
} elsif ( $F[6] !~ m/^\d+/ ) {
# end badly aligned: skip
} else {
if ( $F[6] =~ m/^(\d+)(\D.*)/ ) {
if ( $1 >= $min_align ) {
if ( $F[6] =~ m/^(\d+)\D*/ ) {
my ($m) = $1;
# Number of matches at end high enough?
if($m >= $min_align) {
$alignflag = 1;
# If not, there's another chance if only 1 indel/mm follows
} elsif ( $F[6] =~ m/^(\d+)(\D+)(\d+)\D*/ ) {
my ($m1, $mm, $m2) = ($3, $2, $1);
                    # single indel/mismatch within the last min_align characters of the alignment
if(length($mm) <= $len_scd) {
if($m1 + $m2 >= $min_align-1) {
$alignflag = 1;
}
}
}
}
} elsif ( $F[6] =~ m/^(\D+)(\d+)\D*/ ) {
my ($m, $mm) = ($2, $1);
# single indel/mismatch at the end of the alignment
if(length($mm) <= $len_scd && $m >= $min_align-1) {
$alignflag = 1;
}
}
}
next if ( $alignflag == 0 );
}
......