From 380bc1d38c03a41e8dbfeedbf6b432403dde660d Mon Sep 17 00:00:00 2001
From: schmiral <ralf.schmidt@unibas.ch>
Date: Wed, 20 Dec 2017 11:29:08 +0100
Subject: [PATCH] updated relative usage inference script to include the
 reporting of single distal sites

---
 .gitignore                                    |    4 +
 PAQR/part_two.Snakefile                       |   18 +-
 ...-usage-inference.single_distal_included.py | 2321 +++++++++++++++++
 3 files changed, 2339 insertions(+), 4 deletions(-)
 create mode 100644 PAQR/scripts/deNovo-used-sites-and-usage-inference.single_distal_included.py

diff --git a/.gitignore b/.gitignore
index 7404340..67d0ce9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,8 +12,12 @@ PAQR/.snakemake/
 PAQR/scripts/deNovo-used-sites-and-usage-inference.testVersion.py
 KAPAC/doIt.sh
 KAPAC/RESULTS/
+KAPAC/RESULTS_FORMER.log
 KAPAC/RESULTS_FORMER/
+KAPAC/RESULTS_FORMER_AFTER_CLEANUP/
 KAPAC/RESULTS.log
+# Rhistory
+*.Rhistory
 KAPAC/DATA/kmer_counts_all.tsv
 # exclude the gitignore as well
 .gitignore
diff --git a/PAQR/part_two.Snakefile b/PAQR/part_two.Snakefile
index b07360d..732f7b7 100644
--- a/PAQR/part_two.Snakefile
+++ b/PAQR/part_two.Snakefile
@@ -83,6 +83,7 @@ rule infer_relative_usage:
     output:
         relUse = "{study}/relative_usages.tsv",
 	expressions = "{study}/tandem_pas_expressions.tsv",
+        distal_sites = "{study}/single_distal_sites.tsv",
         header = "{study}/relative_usages.header.out"
     params:
         script_dir = config['dir.scripts'],
@@ -115,9 +116,8 @@ rule infer_relative_usage:
         conds_string = " ".join(conds)
         extens_string = " ".join(extens)
         shell('''
-        {params.py2_env_path}/py2_paqr/bin/python {params.script_dir}/deNovo-used-sites-and-usage-inference.py \
+        {params.py2_env_path}/py2_paqr/bin/python {params.script_dir}/deNovo-used-sites-and-usage-inference.single_distal_included.py \
         --verbose \
-	--expressions_out {output.expressions} \
         --clusters {params.clusters} \
         --coverages {input.coverages} \
         --conditions {conds_string} \
@@ -131,11 +131,12 @@ rule infer_relative_usage:
         --best_break_point_upstream_extension {params.valid_region_upstream_of_cluster_to_locate_globalBestBreakPoint_in} \
         --processors {threads} \
         --max_downstream_coverage {params.max_downstream_cov_relative_to_start} \
+        --expressions_out {output.expressions} \
+        --distal_sites {output.distal_sites} \
         > {output.relUse} \
         2> {log}
         ''')
 
-
 #-------------------------------------------------------------------------------
 # create design file for polyA-MARA
 #-------------------------------------------------------------------------------
@@ -172,7 +173,8 @@ rule design_file:
 rule tpm_normalize:
     ##LOCAL##
     input:
-        expression = "{study}/tandem_pas_expressions.tsv"
+        expression = "{study}/tandem_pas_expressions.tsv",
+        distal_sites = "{study}/single_distal_sites.tsv"
     output:
         tpm_expr = "{study}/tandem_pas_expressions.rpm.tsv"
     run:
@@ -188,6 +190,14 @@ rule tpm_normalize:
                         continue
                     libsizes[ i - 10 ] += float(F[i])
 
+        with open(input.distal_sites, "r") as distal_in:
+            for line in distal_in:
+                F = line.rstrip().split("\t")  # assumption: same tab-separated layout as the expression file
+                for i in range(10, len(F)):
+                    if float(F[i]) == -1:
+                        continue
+                    libsizes[ i - 10 ] += float(F[i])
+
         with open(output.tpm_expr, "w") as ofile:
             with open(input.expression, "r") as ifile:
                 for line in ifile:
diff --git a/PAQR/scripts/deNovo-used-sites-and-usage-inference.single_distal_included.py b/PAQR/scripts/deNovo-used-sites-and-usage-inference.single_distal_included.py
new file mode 100644
index 0000000..5deeeb6
--- /dev/null
+++ b/PAQR/scripts/deNovo-used-sites-and-usage-inference.single_distal_included.py
@@ -0,0 +1,2321 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Oct 30 13:30:36 2016
+
+Define used poly(A) sites and calculate relative usages (parallelized)
+
+@author: schmiral
+"""
+__date__ = "Wed Oct 05 13:30:36 2016"
+__author__ = "Ralf Schmidt"
+__email__ = "ralf.schmidt@unibas.ch"
+__license__ = "GPL"
+
+
+# imports
+import sys
+import os
+import time
+import HTSeq
+from argparse import ArgumentParser, RawTextHelpFormatter
+import numpy as np
+import cPickle
+import datetime
+import multiprocessing
+import bisect
+
+parser = ArgumentParser(description=__doc__, formatter_class=RawTextHelpFormatter)
+parser.add_argument("-v",
+                    "--verbose",
+                    dest="verbose",
+                    action="store_true",
+                    default=False,
+                    help="Be loud!")
+                    
+parser.add_argument("--clusters",
+                    dest="polyA",
+                    help=("file name providing the poly(A) " +
+                        "sites in extended BED file format") )
+
+parser.add_argument("--coverages",
+                    dest="coverages",
+                    nargs="*",
+                    help=("name of the pickle files that " +
+                        "store the HTSeq objects with coverages") )
+
+parser.add_argument("--conditions",
+                    dest="conditions",
+                    nargs="*",
+                    help=("values that match the coverages " +
+                        "parameters and indicate the condition each " +
+                        "coverage object belongs to") )
+                    
+parser.add_argument("--read_length", 
+                    dest="read_length",
+                    type=int,
+                    default=100,
+                    help=("integer value for the expected read length " +
+                        "(used for window to fit a regression line to the " +
+                        "upstream region of poly(A) sites) [default: 100]") )
+
+parser.add_argument("--min_coverage_region",
+                    dest="min_coverage_region",
+                    type=int, 
+                    default=100,
+                    help=("integer value that defines the minimum region " +
+                        "that needs to be available to calculate a mean " +
+                        "coverage and the mean squared error [default: 100]") )
+
+parser.add_argument("--min_mean_coverage",
+                    dest="min_mean_coverage",
+                    type=float, 
+                    default=20.0,
+                    help=("float value that defines the minimum mean " +
+                        "coverage required for an exon to be considered " +
+                        "[default: 20.0]") )
+
+parser.add_argument("--min_cluster_distance",
+                    dest="min_cluster_distance",
+                    type=int, 
+                    default=200,
+                    help=("integer value that defines the distance till " +
+                        "which clusters are merged and analyzed separately " +
+                        "[default: 200]") )
+                    
+parser.add_argument("--mse_ratio_threshold",
+                    dest="mse_ratio_threshold",
+                    type=float,
+                    default=0.7,
+                    help=("the ratio of the mse after including a new break " +
+                        "point devided by the mse before must be below this " +
+                        "threshold [default: 0.7]") )
+                    
+parser.add_argument("--best_break_point_upstream_extension",
+                    dest="best_break_point_upstream_extension",
+                    type=int,
+                    default=200,
+                    help=("nucleotide extension added to the cluster's " +
+                        "upstream end during the check for global break " +
+                        "points [default: 200]") )
+                    
+parser.add_argument("--ds_reg_for_no_coverage",
+                    dest="ds_reg_for_no_coverage",
+                    type=int, 
+                    default=200,
+                    help=("integer value that defines the region that is " +
+                        "downstream of the exon and necessary to search for " +
+                        "zero coverage downstream of the distal site  [default: 200]") )
+
+parser.add_argument("--max_downstream_coverage",
+                    dest="max_downstream_coverage",
+                    default=5,
+                    type=float,
+                    help=("<FLOAT> maximum percentage of the coverage at the exon " +
+                          "start is allowed for the mean coverage in the downstream " +
+                          "region of a valid distal site [default: 5]") )
+
+parser.add_argument("--processors",
+                    dest="proc",
+                    type=int,
+                    default=1,
+                    help="<INT> Number of processors used [default: 1]")
+
+parser.add_argument("-d",
+                    "--debug",
+                    dest="debug",
+                    action="store_true",
+                    default=False,
+                    help="Print additional diagnostic comments for single exons")
+
+parser.add_argument("--ex_extension_files",
+                    dest="ex_extension_files",
+                    nargs="*",
+                    help=("name of the files that " +
+                        "store the upstream extensions per exon for the corresponding sample") )
+
+parser.add_argument("--expressions_out",
+                    dest="expressions_out",
+                    help="name of file to write the raw expression values per site per sample to")
+
+parser.add_argument("--distal_sites",
+                    dest="distal_sites",
+                    help="file name to which the exons are written that have only distal site usage\n")
+
+#parser.add_argument("--min_r_squared", 
+#                    dest="r_sq",
+#                    type=float,
+#                    default = 0.2,
+#                    help="r-squared threshold for the regression line fit to the coverage [default: 0.2]")
+
+# define shorthands for writing to stdout and stderr to save some writing
+syserr = sys.stderr.write
+sysout = sys.stdout.write
+
+np.seterr(all='raise')
+
+def time_now():  # return the current time as a formatted string
+    curr_time = datetime.datetime.now()
+    return curr_time.strftime("%c")
+    
+def find_distal_site(site,
+                     ex_start,
+                     strand, 
+                     region, 
+                     merged_exons,
+                     max_downstream_coverage,
+                     read_length, 
+                     ds_reg_extend,
+                     debug):
+    
+    """Check if the passed poly(A)site is applicable as distal site
+    
+    Use the combined coverage of all samples here"""
+
+    #----------------------------------------
+
+    # define the exon start reference region
+    # get the mean coverage of this region
+    # (ex_start saves the strand specific exon start)
+    if strand == "+":
+        ex_start_idx = region.index(ex_start)
+
+        if ex_start + read_length > site[1].start:
+            # consistency check:
+            # is exon start more upstream than site start
+            if site[1].start < ex_start:
+                syserr("[INFO] Distal site search stopped whith the current site %s that overlaps the exon start\n"
+                       % site[0] )
+                return(None, -1)
+
+            # exon is very short: use the start of the currently investigated
+            # poly(A) site as boundary
+            ex_start_2_idx = region.index(site[1].start)
+        else:
+            ex_start_2_idx = region.index(ex_start + read_length)
+    else:
+        if ex_start - read_length < site[1].end:
+            # consistency check:
+            # is exon start more upstream than site start (meaning the strand specific starts)
+            if site[1].end >= ex_start:
+                syserr("[INFO] Distal site search stopped whith the current site %s that overlaps the exon start\n"
+                       % site[0] )
+                return(None, -1)
+            
+            # exon is very short: use the start of the currently investigated
+            # poly(A) site as boundary
+            ex_start_idx = region.index( site[1].end)
+        else:
+            ex_start_idx = region.index(ex_start - read_length)
+            
+        ex_start_2_idx = region.index(ex_start - 1) + 1
+
+    if ex_start_2_idx - ex_start_idx <= 0:
+        syserr("[INFO] Distal site search stopped whith the current site %s that overlaps the exon start\n"
+               % site[0])
+        return(None, -1)
+        
+    ex_start_cov = np.mean(merged_exons[ ex_start_idx:ex_start_2_idx ])
+
+    if ex_start_cov == 0:
+        if debug:
+            syserr( ("[INFO] No coverage at exon start of site %s\n" +
+                     "[INFO] Found during search for distal site. Exon is skipped\n")
+                    % site[0] )
+        return (None, -1)
+
+    #----------------------------------------
+
+    # define the upstream reference region
+    # get the mean coverage of this region
+
+    if strand == "+":
+        if site[1].start - (2 * read_length) < ex_start:
+            # exon is very short
+            # use the exon start as ref-region start
+            ex_us_idx = region.index(ex_start)
+        else:
+            ex_us_idx = region.index( site[1].start - (2 * read_length) )
+        ex_us_2_idx = region.index( site[1].start )
+    else:
+        ex_us_idx = region.index( site[1].end )
+        if site[1].end + (2 * read_length) >= ex_start:
+            # exon is very short
+            # use the exon start as ref-region start
+            ex_us_2_idx = region.index(ex_start - 1) + 1
+        else:
+            ex_us_2_idx = region.index( site[1].end + (2 * read_length) )
+    ex_us_cov = np.mean(merged_exons[ ex_us_idx:ex_us_2_idx ])
+
+    #----------------------------------------
+
+    # define the downstream reference region
+    # get the mean coverage of this region
+
+    if strand == "+":
+        ex_ds_idx = region.index(site[1].end )
+        # ex_ds_2_idx = region.index( site[1].end + (2 * read_length) )
+        ex_ds_2_idx = region.index( site[1].end + (ds_reg_extend) - 1 ) + 1
+        ex_ds_cov = np.mean( merged_exons[ ex_ds_idx:ex_ds_2_idx] )
+    else:
+        # ex_ds_idx = region.index( site[1].start - (2 * read_length) )
+        ex_ds_idx = region.index( site[1].start - (ds_reg_extend) )
+        ex_ds_2_idx = region.index( site[1].start )
+        ex_ds_cov = np.mean(merged_exons[ ex_ds_idx:ex_ds_2_idx ])
+
+    #----------------------------------------
+
+    # a valid distal site has a higher us_mean_cvg than ds_mean_cvg
+    # and the percentage of the ds_mean_cvg with respect to
+    # the start_mean_cvg is below the threshold
+    # (threshold defines the maximum percentage the ds_region is allowed to have)
+
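+    # Illustrative example (hypothetical numbers): with
+    # max_downstream_coverage = 5, ex_start_cov = 200.0 and ex_ds_cov = 8.0,
+    # the check gives 8.0 / 200.0 * 100 = 4.0 <= 5, so the site qualifies
+    # provided that ex_us_cov also exceeds ex_ds_cov.
+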
+    if( ex_us_cov > ex_ds_cov and
+        ex_ds_cov / float(ex_start_cov) * 100 <= max_downstream_coverage):
+        # distal site found
+        return(1, 1)
+    else:
+        # do not use this site as distal site
+        return (None, 0)
+
+def assess_minimum_cvg(exon_structure, bp_coverages_dict, region, min_mean_cvg):
+    '''Iterate over the dict with coverages and assess
+    for every single sample whether it has sufficient mean coverage
+    at the current exon
+    '''
+
+    return_dict = {}
+    
+    start = exon_structure[2]
+    end = exon_structure[3]
+
+    # infer the indices for the current exon boundaries
+    start_idx = region.index( start )
+    try:
+        end_idx = region.index( end )
+    except ValueError:
+        # for negative strand sites:
+        # exon end may not be accessible as position
+        end_idx = region.index( end - 1) + 1
+
+    for cond in bp_coverages_dict:
+        return_dict[ cond ] = []
+        for cvg in bp_coverages_dict[cond]:
+            if np.mean( cvg[start_idx:end_idx] ) < min_mean_cvg:
+                return_dict[ cond ].append(0)
+            else:
+                return_dict[ cond ].append(1)
+
+    return return_dict
+
+def find_best_break_point_in_full_exon(bp_cvg_dict,
+                                       valid_mean_cvg_dict,
+                                       region,
+                                       strand,
+                                       ex_start,
+                                       ex_end,
+                                       min_cvg_reg,
+                                       mse_ratio_threshold,
+                                       upstream_extension):
+    '''this function is only used when only a distal site
+    was found and the upstream coverage is screened for
+    unknown sites'''
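+
+    # Per sample, the (possibly upstream-extended) exon is scanned as in
+    # check_global_best_breaking_point: the best two-part split is compared
+    # to the flat single-mean fit, and a ratio below mse_ratio_threshold
+    # flags a break point even though no annotated site explains it.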
+
+    # store for which samples a valid break point was inferred
+    valid_break_point_dict = {}
+
+    # prepare the arrays for the search of the best separation point
+    curr_ex_start_dict = {}
+    curr_ex_end_dict = {}
+    # infer sample-specific exon boundaries
+    # (this is necessary due to the individual upstream extension)
+    for cond in bp_cvg_dict:
+        curr_ex_start_dict[cond]=[]
+        curr_ex_end_dict[cond]=[]
+        for cvg_idx in range(len(bp_cvg_dict[cond])):
+            # extend the exon start by the upstream extension
+            curr_add_us_ext = upstream_extension[cond][cvg_idx]
+            if strand == "+":
+                # use the extended exon as start pos
+                curr_ex_start_dict[cond].append( ex_start - curr_add_us_ext )
+                curr_ex_end_dict[cond].append( ex_end )
+            else:
+                # minus strand
+                # extend the exon at the other end
+                curr_ex_start_dict[cond].append( ex_start )
+                curr_ex_end_dict[cond].append( ex_end + curr_add_us_ext )
+
+    for cond in bp_cvg_dict:
+        valid_break_point_dict[cond] = []
+        for cvg_idx in range(len(bp_cvg_dict[cond])):
+
+            cvg = bp_cvg_dict[cond][cvg_idx]
+
+            # if the current sample has no valid mean cvg:
+            # store False in valid_break_point_dict
+            if valid_mean_cvg_dict[cond][cvg_idx] == 0:
+                valid_break_point_dict[cond].append(False)
+                continue
+
+            # initialize the best ratio with an arbitrarily high value
+            curr_best_ratio = 10
+
+            # set up the indices for the region
+            try:
+                upstream_start_idx = region.index( curr_ex_start_dict[cond][cvg_idx] )
+            except ValueError:
+                syserr("[ERROR] Upstream start pos not in list while scanning for overall break point\n")
+                syserr("[ERROR] Upstream start: %i\n" 
+                       %( curr_ex_start_dict[cond][cvg_idx] ) )
+                raise ValueError("[ERROR] Interrupt due to missing position\n")
+
+            try:
+                downstream_end_idx = region.index( curr_ex_end_dict[cond][cvg_idx] - 1 ) + 1
+            except ValueError:
+                syserr("[ERROR] Downstream end pos not in list\n")
+                syserr("[ERROR] Downstream end: %i\n" 
+                       %( curr_ex_end_dict[cond][cvg_idx] ) )
+                raise ValueError("[ERROR] Interrupt due to missing position\n")
+
+            # global mse
+            cvg_global = cvg[upstream_start_idx:downstream_end_idx]
+            global_mse = np.mean( (cvg_global - np.mean(cvg_global))**2 )
+
+            best_pos_idx = None
+            for pos_idx in range(upstream_start_idx + min_cvg_reg + 1, downstream_end_idx - min_cvg_reg - 1):
+                # get mse for the current two parts
+                cvg_right = cvg[pos_idx + 1: downstream_end_idx]
+                cvg_left = cvg[upstream_start_idx:pos_idx]
+                cov_diff = cvg_left - np.mean(cvg_left)
+                cov_diff = np.append( cov_diff, (cvg_right - np.mean(cvg_right) ) )
+                curr_mse = np.mean( cov_diff**2 )
+                mse_ratio = float(curr_mse) / global_mse
+                if mse_ratio < curr_best_ratio:
+                    curr_best_ratio = mse_ratio
+
+            if curr_best_ratio < mse_ratio_threshold:
+                valid_break_point_dict[cond].append(True)
+            else:
+                valid_break_point_dict[cond].append(False)
+
+    return valid_break_point_dict
+    
+def check_mse_improvement(bp_coverages_dict,
+                          valid_mean_cvg_dict,
+                          region, 
+                          site, 
+                          strand, 
+                          segment_start,
+                          segment_end,
+                          min_coverage_region, 
+                          best_break_point_upstream_extension, 
+                          mse_ratio_threshold, 
+                          upstream_extension, 
+                          prev_mse,
+                          extended_prev_mse=None):
+    """check if current poly(A) site improves the mean squared error"""
+    
+    us_mse_dict = {}
+    ds_mse_dict = {}
+    mse_ratio_dict = {}
+
+    segment_starts = {}
+    segment_ends = {}
+
+    # for both strands: iterate over region from 3' to 5'
+    fit_region_to_iterate = []
+    fit_region_start = site[1].start
+    fit_region_end = site[1].end
+
+    if strand == "+" :
+        fit_region_start -= best_break_point_upstream_extension
+        fit_region_to_iterate = range( fit_region_start, fit_region_end )[::-1]
+    else:
+        fit_region_end += best_break_point_upstream_extension
+        fit_region_to_iterate = range( fit_region_start, fit_region_end )
+
+    for cond in bp_coverages_dict:
+        us_mse_dict[cond] = []
+        ds_mse_dict[cond] = []
+        mse_ratio_dict[cond] = []
+        segment_starts[cond] = []
+        segment_ends[cond] = []
+        
+        cvg_idx = -1
+        for cvg in bp_coverages_dict[cond]:
+            cvg_idx += 1
+            curr_best_ratio = 10
+
+            # store whether the exon was already extended upstream
+            # (artificial exon upstream extension only possible once per site)
+            already_extended = False
+
+            # if the current sample has no valid mean cvg:
+            # append 10 as curr_best_ratio and continue
+            if valid_mean_cvg_dict[cond][cvg_idx] == 0:
+                mse_ratio_dict[cond].append(curr_best_ratio)
+                # append a placeholder value also as segment start/end (strand dependent)
+                if strand == "+":
+                    segment_starts[cond].append(0)
+                else:
+                    segment_ends[cond].append(0)
+                continue
+
+            # set up up- and downstream region
+            # (depending on the strand, some of these are only set inside the loop below)
+            if strand == "+":
+                up_start = segment_start
+                down_end = segment_end
+                # store the upstream starts in case these are extended in the next for loop
+                segment_starts[cond].append(up_start)
+            else:
+                down_start = segment_start
+                up_end = segment_end
+                # store the upstream end (aka strand specific starts) in case these are extended in the next for loop
+                segment_ends[cond].append(up_end)
+                
+            prev_reg_mse = prev_mse[cond][cvg_idx]
+            if prev_reg_mse == 0:
+                # skip this coverage because for this region it is already 
+                # completely flat
+                syserr( ("[INFO] Skip cond %s cvg %i. Found previous reg " + 
+                       "to have zero mse for site %s!\n")
+                       % (cond, cvg_idx, str(site[0])) )
+                # no reasonable position was found: still the starting value of 10
+                # is appended as curr_best_ratio
+                mse_ratio_dict[cond].append(curr_best_ratio)
+                continue
+
+            for pos in fit_region_to_iterate:
+                
+                # leave out the current pos
+                # (pos belongs neither to the upstream nor to the downstream region)
+                if strand == "+":
+                    up_end = pos
+                    down_start = pos + 1
+                else:
+                    up_start = pos + 1
+                    down_end = pos
+
+                if(up_end - 
+                    up_start < min_coverage_region):
+                    # upstream region is too small
+                    # if currently the most proximal site is considered,
+                    # the additional upstream extension of the coverage is exploited
+                    # (current site is the most proximal <=> extended_prev_mse != None)
+                    if extended_prev_mse is not None and not already_extended:
+                        # extended upstream region is used, hence, 
+                        # the extended previous mse should be used as well
+                        prev_reg_mse = extended_prev_mse[cond][cvg_idx]
+
+                        if strand == "+":
+                            up_start -= upstream_extension[cond][cvg_idx]
+                            segment_starts[cond][cvg_idx] = up_start
+                        else:
+                            up_end += upstream_extension[cond][cvg_idx]
+                            segment_ends[cond][cvg_idx]= up_end
+                        already_extended = True
+
+                        if(up_end - 
+                            up_start < min_coverage_region):
+                            # syserr(("[INFO] No upstream region for %s at pos " +
+                            #         "%i and potential more upstream positions " +
+                            #         "even though upstream extension of the exon. " +
+                            #         "Skip this and more upstream positions.\n") 
+                            #         % (site[0], pos))
+                            break
+                    else:
+                        # syserr(("[INFO] No upstream region for internal pA %s at pos %i. " +
+                        #         "Skip this and more upstream positions.\n")
+                        #         % (site[0], pos))
+                        break
+                if(down_end - 
+                    down_start <= min_coverage_region):
+                    syserr("[INFO] No downstream region for %s at pos %i\n" 
+                            % (site[0], pos))
+                    break
+
+                ### Finished definition of indexes
+
+                ### set up upstream region
+                try:
+                    up_start_idx = region.index(up_start)
+                except ValueError:
+                    syserr("[ERROR] upstream start pos not in list\n")
+                    syserr("[ERROR] Upstream start: %i, upstream stop: %i, diff: %i, min_diff: %i\n" 
+                           %( up_start, up_end, up_end - up_start, min_coverage_region) )
+                    raise ValueError("[ERROR] Interrupt due to missing position\n")
+
+                try:
+                    up_end_idx = region.index(up_end)
+                except ValueError:
+                    # for negative strand sites:
+                    # exon end may not be accessible as position
+                    up_end_idx = region.index(up_end - 1) + 1
+
+                cvg_upstream = cvg[up_start_idx:up_end_idx]
+                mean_upstream = np.mean( cvg_upstream )
+
+                ### set up downstream region
+                down_start_idx = region.index(down_start)
+                down_end_idx = region.index(down_end)
+
+                cvg_downstream = cvg[down_start_idx:down_end_idx]
+                mean_downstream = np.mean( cvg_downstream )
+
+                # continue if the downstream mean is higher than the upstream mean
+                if mean_downstream > mean_upstream:
+                    continue
+
+                # get a mean squared error of up- and downstream region
+                cov_diff = cvg_downstream - mean_downstream
+                cov_diff = np.append( cov_diff, cvg_upstream - mean_upstream )
+                curr_mse = np.mean( cov_diff**2)
+                
+                mse_ratio = float(curr_mse) / prev_reg_mse
+                if mse_ratio < curr_best_ratio:
+                    curr_best_ratio = mse_ratio
+
+            # if no reasonable position was found: still the starting value of 10
+            # is appended as curr_best_ratio
+            mse_ratio_dict[cond].append(curr_best_ratio)
+
+    # # test
+    # syserr("[tmp] inferred mse_ratios for current site %s: %s\n" % (site[0], str(mse_ratio_dict)))
+                
+    # check if site has valid mse improvement in any of the conditions
+    # for this the median mse ratio must meet the threshold criterion
+    valid_conds = []
+    for cond in mse_ratio_dict:
+        # only consider samples with valid mean_cvg
+        mse_ratio_subset = np.array([mse_ratio_dict[cond][i] for i in range(len( mse_ratio_dict[cond] ) ) if valid_mean_cvg_dict[cond][i] == 1])
+        if len(mse_ratio_subset) == 0:
+            continue
+        
+        # different possibilities for a condition to be considered "valid":
+        # two samples or fewer: all must be below the threshold
+        # three to five samples: the median must be below the threshold
+        # six or more samples: at least three samples must be below the threshold
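+        # Worked example (hypothetical ratios, threshold 0.7): four valid
+        # samples with ratios [0.5, 0.6, 0.9, 10] -> appending 0 gives
+        # median([0, 0.5, 0.6, 0.9, 10]) = 0.6 < 0.7, so at least half of
+        # the samples support the site and the condition counts as valid.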
+        if len(mse_ratio_subset) <= 2:
+            if sum( mse_ratio_subset < mse_ratio_threshold ) == len(mse_ratio_subset):
+                valid_conds.append(cond)
+        elif len(mse_ratio_subset) <= 5:
+            if len(mse_ratio_subset) % 2 == 1 and np.median( mse_ratio_subset ) < mse_ratio_threshold:
+                valid_conds.append(cond)
+            elif len(mse_ratio_subset) % 2 == 0 and np.median( np.append(mse_ratio_subset,0) ) < mse_ratio_threshold:
+                # the number of samples is even, therefore a zero is appended temporarily
+                # this ensures that the median corresponds to half of the samples
+                valid_conds.append(cond)
+        else:
+            if sum( mse_ratio_subset < mse_ratio_threshold) >= 3:
+                valid_conds.append(cond)
+            
+    if len(valid_conds) > 0:
+        
+        # potential site found
+        # complete the information for the mse dicts
+        # define the new up_mse and ds_mse values based on 
+        # the poly(A) site coordinates
+        for cond in bp_coverages_dict:
+            cvg_idx = -1
+            for cvg in bp_coverages_dict[cond]:
+
+                cvg_idx += 1
+
+                if valid_mean_cvg_dict[cond][cvg_idx] == 0:
+                    us_mse_dict[cond].append(100)
+                    ds_mse_dict[cond].append(100)
+                    continue
+
+                # calculate the missing up- and downstream mses for the coverages
+                # exclude the region of the poly(A) site cluster when calculating
+                # the mses
+                if strand == "+":
+                    up_start_idx = region.index(segment_starts[cond][cvg_idx])
+                    up_end_idx = region.index(site[1].start)
+                    down_start_idx = region.index(site[1].end)
+                    down_end_idx = region.index(segment_end)
+                else:
+                    up_start_idx = region.index(site[1].end)
+                    try:
+                        up_end_idx = region.index(segment_ends[cond][cvg_idx])
+                    except ValueError:
+                        # for negative strand sites:
+                        # exon end may not be accessible as position
+                        up_end_idx = region.index(segment_ends[cond][cvg_idx] - 1) + 1
+                    down_start_idx = region.index(segment_start)
+                    down_end_idx = region.index(site[1].start)
+
+                cvg_upstream = cvg[up_start_idx:up_end_idx]
+                mean_upstream = np.mean(cvg_upstream)
+                us_mse = np.mean((cvg_upstream - mean_upstream)**2)
+
+                cvg_downstream = cvg[down_start_idx:down_end_idx]
+                mean_downstream = np.mean( cvg_downstream )
+                ds_mse = np.mean((cvg_downstream - mean_downstream)**2)
+
+                us_mse_dict[cond].append(us_mse)
+                ds_mse_dict[cond].append(ds_mse)
+
+    return (mse_ratio_dict, us_mse_dict, ds_mse_dict, valid_conds)
+
+def check_global_best_breaking_point(bp_cvg,
+                                     used_sites_positions,
+                                     start_idx,
+                                     end_idx,
+                                     min_coverage_region):
+
+    """Find best breaking point irrespective of poly(A) sites.
+    Function to process single sites."""
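+
+    # The scan below mirrors the MSE split used for site detection: every
+    # position between the two minimal coverage regions is tried as a
+    # break point and the split with the smallest pooled MSE wins; if that
+    # position is not supported by an annotated poly(A) site, 0 is
+    # returned as indicator together with the position index.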
+
+    cvg_global = bp_cvg[start_idx:end_idx]
+    global_mse = np.mean( (cvg_global - np.mean(cvg_global))**2 )
+    smallest_mse = global_mse
+    best_pos_idx = None
+    for pos_idx in range(start_idx + min_coverage_region, end_idx - min_coverage_region):
+        # get mse for the current two parts
+        cvg_right = bp_cvg[pos_idx + 1: end_idx]
+        cvg_left = bp_cvg[start_idx:pos_idx]
+        cov_diff = cvg_left - np.mean(cvg_left)
+        cov_diff = np.append( cov_diff, (cvg_right - np.mean(cvg_right) ) )
+        curr_mse = np.mean( cov_diff**2 )
+        if curr_mse < smallest_mse:
+            smallest_mse = curr_mse
+            best_pos_idx = pos_idx
+
+    if best_pos_idx is not None:
+                
+        if best_pos_idx not in used_sites_positions:
+            # break point is outside of regions that are poly(A)-site supported
+            return(0, best_pos_idx)
+
+        else:
+            # valid break point found
+            return( best_pos_idx, best_pos_idx )
+
+    # no break point was found; return "None"
+    return( best_pos_idx, best_pos_idx )
+    
+def check_global_best_breaking_points(bp_cvg,
+                                      cond,
+                                      sample_idx,
+                                      region,
+                                      used_sites_positions,
+                                      still_undetected,
+                                      start_idx,
+                                      end_idx,
+                                      min_coverage_region):
+    """Find best breaking points irrespective of poly(A) sites.
+    Wrapper for the entire exon."""
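+
+    # Idea of the loop below (summary): boundary_indices starts with the
+    # exon boundaries; each round picks the first segment that still holds
+    # an undetected site, asks check_global_best_breaking_point for its
+    # best split, inserts that position via bisect.insort and marks the
+    # matched site as detected, until no undetected site is left.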
+    
+    boundary_indices = [start_idx, end_idx]
+    while any(x is not None for x in still_undetected):
+        check_curr_segment = False
+        for segment_idx in range(1,len(boundary_indices)):
+            for idx in range(boundary_indices[segment_idx -1], boundary_indices[segment_idx]):
+                if idx in used_sites_positions and still_undetected[used_sites_positions[idx]] is not None:
+                    check_curr_segment = True
+                    break
+            if check_curr_segment:
+                break
+
+        
+        # conds_to_check = set()
+        # for segment_idx in range(1,len(boundary_indices)):
+        #     # get the conditions for which poly(A) sites that act as breaking points were inferred
+        #     conds_to_check = set()
+        #     samples_to_consider = {}
+        #     for idx in range(boundary_indices[segment_idx -1], boundary_indices[segment_idx]):
+        #         if idx in used_sites_positions and still_undetected[used_sites_positions[idx]] is not None:
+        #             conds_to_check.update(still_undetected[used_sites_positions[idx]][3])
+        #             # iterate over the supporting_cvgs dict of the current site
+        #             for cond in still_undetected[used_sites_positions[idx]][4]:
+        #                 if cond not in samples_to_consider:
+        #                     samples_to_consider[cond] = still_undetected[used_sites_positions[idx]][4][cond]
+        #                 else:
+        #                     # update the currently stored supporting_cvgs for cond
+        #                     # include so far missing samples
+        #                     for i in range(len( samples_to_consider[cond] )):
+        #                         if still_undetected[used_sites_positions[idx]][4][cond][i]:
+        #                             samples_to_consider[cond][i] = True
+
+        #     if len(conds_to_check) > 0:
+        #         break
+        # if len(conds_to_check):
+
+            
+        # use the current borders to find the best breaking point
+        (pos_idx_indicator, pos_idx) = check_global_best_breaking_point(bp_cvg,
+                                                                        used_sites_positions,
+                                                                        boundary_indices[segment_idx -1],
+                                                                        boundary_indices[segment_idx],
+                                                                        min_coverage_region)
+        if pos_idx_indicator == 0:
+            # syserr("[ERROR] Break point outside of poly(A) site regions\n")
+            return (boundary_indices, pos_idx)
+        elif pos_idx_indicator is None:
+            # syserr("[ERROR] No valid break point found")
+            return (boundary_indices, pos_idx)
+
+        else:
+            # test
+            #if still_undetected[used_sites_positions[pos_idx]] is not None:
+            #    rep_site = still_undetected[used_sites_positions[pos_idx]][0]
+            #    pos_idx = region.index( int(rep_site.split(":")[2]) )
+            #syserr("[INFO] Inferred break point: %i\n" % pos_idx)
+            bisect.insort(boundary_indices, pos_idx)
+            still_undetected[used_sites_positions[pos_idx]] = None
+
+        # else:
+        #     syserr(("[ERROR] No sites found anymore even though still " +
+        #             "undetected sites are available. undetected_sites: %s\n") 
+        #             % str(still_undetected))
+        #     syserr("[ERROR] Current boundary indices: %s\n" % str(boundary_indices))
+        #     return "Error"
+            
+    return "Finished"
+    
+def find_sites_with_usage(sites,
+                          segment_start,
+                          segment_end,
+                          segment_mse,
+                          original_extended_segment_mse,
+                          bp_coverages_dict,
+                          valid_mean_cvg_dict,
+                          region,
+                          strand,
+                          min_coverage_region,
+                          best_break_point_upstream_extension, 
+                          mse_ratio_threshold, 
+                          add_us_extension):
+    """iterate over the poly(A) sites
+    and find the best poly(A) site
+    """
+
+    # keep the original value of extended_segment_mse untouched
+    # to know whether it needs to be updated below in the code
+    extended_segment_mse = original_extended_segment_mse
+
+    top_site = None
+    top_site_idx = None
+    top_site_prot_support = 0
+    top_mse_ratio = 1
+    top_us_mse_dict = {}
+    top_ds_mse_dict = {}
+    top_site_valid_conditions = []
+    top_site_supporting_cvgs = {}
+    top_site_is_from_cluster = None
+    
+    for site_idx in range(len(sites)):
+        site = sites[site_idx]
+
+        if isinstance(site[0], list):
+            # set the extended_segment_mse to None if the most proximal
+            # site has enough distance to the exon start
+            # (extended_segment_mse might be None anyway already
+            #  if the current segment is a subsegment of the exon that
+            #  is not most proximally located)
+            if( (strand == "+" and (site[0][1].start
+                                    - segment_start
+                                    - best_break_point_upstream_extension) >= min_coverage_region) or
+                (strand == "-" and (segment_end
+                                    - site[0][1].end
+                                    - best_break_point_upstream_extension) >= min_coverage_region)):
+                extended_segment_mse = None
+
+            # process a set of closely spaced clusters
+            best_candidate_ratio = 1
+            best_candidate_prot_support = 0
+            best_candidate_up_mse_dict = {}
+            best_candidate_ds_mse_dict = {}
+            best_candidate_site = None
+            best_candidate_valid_conditions = []
+            best_candidate_supporting_cvgs = {}
+            # iterate over set of closely spaced sites (prox -> dist):
+            subsite_idx = -1
+            for subsite in site:
+                subsite_idx += 1
+                (mse_ratios, 
+                 us_mse_dict,
+                 ds_mse_dict,
+                 valid_conds_list) = check_mse_improvement(bp_coverages_dict,
+                                                           valid_mean_cvg_dict,
+                                                           region, 
+                                                           subsite, 
+                                                           strand, 
+                                                           segment_start,
+                                                           segment_end,
+                                                           min_coverage_region,  
+                                                           best_break_point_upstream_extension, 
+                                                           mse_ratio_threshold, 
+                                                           add_us_extension,
+                                                           segment_mse,
+                                                           extended_segment_mse)
+                 
+                # Use the MINIMAL median-ratio of this site (only from samples below the cutoff)
+                # check all conditions separately
+                # when ratio is the same: take the site with higher protocol support
+                # when two sites share the same top ratio AND the protocol support:
+                # take the more proximal site
+                curr_best_median_ratio = 2
+                curr_supporting_cvgs = {}
+                for cond in valid_conds_list:
+                    valid_ratios = [ i for i in mse_ratios[cond] if i < mse_ratio_threshold ]
+                    curr_supporting_cvgs[cond] = [ i < mse_ratio_threshold for i in mse_ratios[cond]]
+                    if np.median( valid_ratios ) < curr_best_median_ratio:
+                        curr_best_median_ratio = np.median( valid_ratios )
+                if curr_best_median_ratio < best_candidate_ratio:
+                    best_candidate_ratio = curr_best_median_ratio
+                    best_candidate_site = subsite
+                    best_candidate_prot_support = subsite[2]
+                    best_candidate_up_mse_dict = us_mse_dict
+                    best_candidate_ds_mse_dict = ds_mse_dict
+                    best_candidate_valid_conditions = valid_conds_list
+                    best_candidate_supporting_cvgs = curr_supporting_cvgs
+                elif( curr_best_median_ratio == best_candidate_ratio and
+                      subsite[2] > best_candidate_prot_support):
+                    best_candidate_ratio = curr_best_median_ratio
+                    best_candidate_site = subsite
+                    best_candidate_prot_support = subsite[2]
+                    best_candidate_up_mse_dict = us_mse_dict
+                    best_candidate_ds_mse_dict = ds_mse_dict
+                    best_candidate_valid_conditions = valid_conds_list
+                    best_candidate_supporting_cvgs = curr_supporting_cvgs
+            if best_candidate_ratio < 1:
+                # for this set of sites, a potentially used site was found
+                if best_candidate_ratio < top_mse_ratio:
+                    top_site = best_candidate_site
+                    top_site_idx = site_idx
+                    top_mse_ratio = best_candidate_ratio
+                    top_site_prot_support = best_candidate_prot_support
+                    top_us_mse_dict = best_candidate_up_mse_dict
+                    top_ds_mse_dict = best_candidate_ds_mse_dict
+                    top_site_valid_conditions = best_candidate_valid_conditions
+                    top_site_supporting_cvgs = best_candidate_supporting_cvgs
+                    top_site_is_from_cluster = subsite_idx
+                # elif( best_candidate_ratio == top_mse_ratio and
+                #      top_site_prot_support < best_candidate_prot_support):
+                #     # do nothing and preferentially consider the more
+                #     # proximal site
+                #     continue
+                #     # top_site = best_candidate_site
+                #     # top_site_idx = site_idx
+                #     # top_site_prot_support = best_candidate_prot_support
+                #     # top_us_mse_dict = best_candidate_up_mse_dict
+                #     # top_ds_mse_dict = best_candidate_ds_mse_dict
+                #     # top_site_valid_conditions = best_candidate_valid_conditions
+                    
+        else:
+            # site is solitary
+
+            # only use the extended_segment_mse if necessary
+            if( (strand == "+" and (sites[site_idx][1].start
+                                    - segment_start
+                                    - best_break_point_upstream_extension) >= min_coverage_region) or
+                (strand == "-" and (segment_end
+                                    - site[1].end
+                                    - best_break_point_upstream_extension) >= min_coverage_region)):
+                extended_segment_mse = None
+
+            site = sites[site_idx]
+            (mse_ratios, 
+             us_mse_dict,
+             ds_mse_dict,
+             valid_conds_list) = check_mse_improvement(bp_coverages_dict,
+                                                       valid_mean_cvg_dict,
+                                                       region,
+                                                       site,
+                                                       strand,
+                                                       segment_start,
+                                                       segment_end,
+                                                       min_coverage_region,
+                                                       best_break_point_upstream_extension,
+                                                       mse_ratio_threshold,
+                                                       add_us_extension,
+                                                       segment_mse,
+                                                       extended_segment_mse)
+            
+            curr_best_median_ratio = 2
+            curr_supporting_cvgs = {}
+            for cond in valid_conds_list:
+                valid_ratios = [ i for i in mse_ratios[cond] if i < mse_ratio_threshold ]
+                curr_supporting_cvgs[cond] = [ i < mse_ratio_threshold for i in mse_ratios[cond]]
+                if np.median( valid_ratios ) < curr_best_median_ratio:
+                    curr_best_median_ratio = np.median( valid_ratios )
+            if curr_best_median_ratio < top_mse_ratio:
+                top_site = site
+                top_site_idx = site_idx
+                top_mse_ratio = curr_best_median_ratio
+                top_site_prot_support = site[2]
+                top_us_mse_dict = us_mse_dict
+                top_ds_mse_dict = ds_mse_dict
+                top_site_valid_conditions = valid_conds_list
+                top_site_supporting_cvgs = curr_supporting_cvgs
+                top_site_is_from_cluster = None
+            # elif( curr_best_median_ratio == top_mse_ratio and
+            #      top_site_prot_support < best_candidate_prot_support):
+            #     # do nothing and preferentially consider the more
+            #     # proximal site
+            #     continue
+            #     # top_site = site
+            #     # top_site_idx = site_idx
+            #     # top_site_prot_support = site[2]
+            #     # top_us_mse_dict = us_mse_dict
+            #     # top_ds_mse_dict = ds_mse_dict
+            #     # top_site_valid_conditions = valid_conds_list
+                
+    if top_site is not None:
+        # site with usage found
+
+        # update information for this site:
+        # valid conditions
+        top_site.append(top_site_valid_conditions)
+        # samples that indicate usage of this site
+        top_site.append(top_site_supporting_cvgs)
+        
+        upstream_sites = sites[:top_site_idx]
+        if len(upstream_sites) > 0:
+            # determine the characteristics 
+            # of the region upstream of the top site
+            if strand == "+":
+                up_seg_start = segment_start
+                # check if top site belonged to a set of closely spaced clusters
+                if top_site_is_from_cluster is None:
+                    up_seg_end = sites[top_site_idx][1].start
+                else:
+                    up_seg_end = sites[top_site_idx][top_site_is_from_cluster][1].start
+            else:
+                if top_site_is_from_cluster is None:
+                    up_seg_start = sites[top_site_idx][1].end
+                else:
+                    up_seg_start = sites[top_site_idx][top_site_is_from_cluster][1].end
+                up_seg_end = segment_end
+
+            if original_extended_segment_mse is not None:
+                # update the extended_segment_mse (fresh dict, so the caller's original stays untouched)
+                extended_segment_mse = {}
+                for cond in bp_coverages_dict:
+                    extended_segment_mse[cond] = []
+                    for cvg_idx in range(len(bp_coverages_dict[cond])):
+                        cvg = bp_coverages_dict[cond][cvg_idx]
+                        if strand == "+":
+                            tmp_start_idx =  region.index(segment_start - add_us_extension[cond][cvg_idx])
+                            if top_site_is_from_cluster is None:
+                                tmp_end_idx = region.index(sites[top_site_idx][1].start)
+                            else:
+                                tmp_end_idx = region.index(sites[top_site_idx][top_site_is_from_cluster][1].start)
+                        else:
+                            if top_site_is_from_cluster is None:
+                                tmp_start_idx = region.index(sites[top_site_idx][1].end)
+                            else:
+                                tmp_start_idx = region.index(sites[top_site_idx][top_site_is_from_cluster][1].end)
+                            try:
+                                tmp_end_idx = region.index(segment_end + add_us_extension[cond][cvg_idx])
+                            except ValueError:
+                                # the end of the extended exon might not be accessible as position anymore
+                                tmp_end_idx = region.index(segment_end + add_us_extension[cond][cvg_idx] - 1) + 1
+
+                        extended_y = cvg[tmp_start_idx:tmp_end_idx]
+                        ext_seg_mean = np.mean( extended_y )
+                        ext_seg_diff = extended_y - ext_seg_mean
+                        ext_seg_mse = np.mean( ext_seg_diff**2)
+                        extended_segment_mse[cond].append( ext_seg_mse)
+                    
+            else:
+                # no extended_segment_mse necessary
+                extended_segment_mse = None       
+                
+        if top_site_idx < len(sites) - 2:
+            # downstream region also still contains potential sites
+            downstream_sites = sites[top_site_idx + 1:]
+            if strand == "+":
+                if top_site_is_from_cluster is None:
+                    ds_seg_start = sites[top_site_idx][1].end
+                else:
+                    ds_seg_start = sites[top_site_idx][top_site_is_from_cluster][1].end
+                ds_seg_end = segment_end
+            else:
+                ds_seg_start = segment_start
+                if top_site_is_from_cluster is None:
+                    ds_seg_end = sites[top_site_idx][1].start
+                else:
+                    ds_seg_end = sites[top_site_idx][top_site_is_from_cluster][1].start
+
+            if len(upstream_sites) > 0:
+                # sites that need to be checked remain both up- and downstream
+                return( find_sites_with_usage(upstream_sites,
+                                              up_seg_start,
+                                              up_seg_end,
+                                              top_us_mse_dict,
+                                              extended_segment_mse,
+                                              bp_coverages_dict,
+                                              valid_mean_cvg_dict,
+                                              region,
+                                              strand,
+                                              min_coverage_region,
+                                              best_break_point_upstream_extension, 
+                                              mse_ratio_threshold, 
+                                              add_us_extension) + 
+                        [top_site] + 
+                        find_sites_with_usage(downstream_sites,
+                                              ds_seg_start,
+                                              ds_seg_end,
+                                              top_ds_mse_dict,
+                                              None, # do not provide an extended upstream mse
+                                              bp_coverages_dict,
+                                              valid_mean_cvg_dict,
+                                              region,
+                                              strand,
+                                              min_coverage_region,
+                                              best_break_point_upstream_extension, 
+                                              mse_ratio_threshold, 
+                                              add_us_extension))
+            else:
+                # only downstream sites remain that need to be checked
+                return( [top_site] + 
+                        find_sites_with_usage(downstream_sites,
+                                              ds_seg_start,
+                                              ds_seg_end,
+                                              top_ds_mse_dict,
+                                              None,
+                                              bp_coverages_dict,
+                                              valid_mean_cvg_dict,
+                                              region,
+                                              strand,
+                                              min_coverage_region,
+                                              best_break_point_upstream_extension, 
+                                              mse_ratio_threshold, 
+                                              add_us_extension))
+            
+        elif len(upstream_sites) > 0:
+            # only upstream sites remain that need to be checked
+            return( find_sites_with_usage(upstream_sites,
+                                          up_seg_start,
+                                          up_seg_end,
+                                          top_us_mse_dict,
+                                          extended_segment_mse,
+                                          bp_coverages_dict,
+                                          valid_mean_cvg_dict,
+                                          region,
+                                          strand,
+                                          min_coverage_region,
+                                          best_break_point_upstream_extension, 
+                                          mse_ratio_threshold, 
+                                          add_us_extension) + 
+                        [top_site] )
+                    
+        else:
+            # return the current site as the only possible site with usage
+            return [top_site]
+            
+    else:
+        # no site with usage found anymore
+        return []
+    
+def get_pA_set(exon_id, 
+               exon_structure, 
+               region, 
+               bp_coverages_dict, 
+               merged_exons, 
+               polyA_sites,
+               exon_ext_dict,
+               options):
+    """Define a set of used poly(A) sites"""
+
+    strand = exon_structure[1]
+    # save the strand specific exon start (used during distal site determination)
+    ex_start = exon_structure[2] if strand == "+" else exon_structure[3]
+    # store the poly(A) sites that are inferred to be used in the current exon
+    used_sites = []
+    # get the exon upstream extensions
+    add_us_ext = exon_ext_dict
+    
+    # # test
+    # syserr("[INFO] ##### Processing of exon %s #####\n" % exon_id)
+    
+    ######
+    # first part
+    ######
+    
+    ### define the distal site
+
+    distal_site = None
+    best_fit_pos = None
+    # iterate over the sites from distal to proximal and take the first one
+    # for which a valid regression line can be fitted to its upstream region
+    for site_idx in range(len(polyA_sites))[::-1]:
+        if isinstance(polyA_sites[site_idx][0], list):
+            # current "site" is a set of closely spaced sites
+            # that are processed together
+            site_found = False
+            # iterate over the set of sites, from distal to proximal
+            for subsite_idx in range(len(polyA_sites[site_idx]))[::-1]:
+                site = polyA_sites[site_idx][subsite_idx]
+                (best_fit_pos, best_fit) = find_distal_site(site,
+                                                            ex_start,
+                                                            strand,
+                                                            region,
+                                                            merged_exons,
+                                                            options.max_downstream_coverage,
+                                                            options.read_length,
+                                                            options.ds_reg_for_no_coverage,
+                                                            options.debug)
+                if best_fit == -1:
+                    # skip exon because it has no exon start coverage
+                    # or the currently processed site already overlaps
+                    # with the exon start
+                    site_found = True
+                    break
+
+                if best_fit_pos is not None:
+                    # a valid regression line was fitted to
+                    # the current upstream region
+                    distal_site = site
+                    site_found = True
+                    break
+            if site_found:
+                # break the outer loop over all poly(A) sites as well
+                break
+
+        else:
+            # the current site is a solitary site
+            site = polyA_sites[site_idx]
+            (best_fit_pos, best_fit) = find_distal_site(site,
+                                                        ex_start,
+                                                        strand,
+                                                        region,
+                                                        merged_exons,
+                                                        options.max_downstream_coverage,
+                                                        options.read_length,
+                                                        options.ds_reg_for_no_coverage,
+                                                        options.debug)
+            if best_fit == -1:
+                # skip exon because it has no exon start coverage
+                # or the currently processed site already overlaps
+                # with the exon start
+                break
+
+            if best_fit_pos is not None:
+                # a valid regression line was fitted to
+                # the current upstream region
+                distal_site = site
+                break
+
+    if distal_site is None:
+        if options.debug:
+            syserr("[INFO] No distal site found for %s. Exon skipped\n" 
+                   % exon_id)
+        return ([], {})
+        
+    # limit the set of sites to the ones upstream of the distal site
+    # (if the distal site comes from a set of closely spaced sites,
+    # the entire set is marked as distal and excluded from further analysis)
+    polyA_sites = polyA_sites[0:site_idx]
+        
+    # adjust the exon end according to the distal site end
+    if strand == "+":
+        exon_structure[3] = distal_site[1].end
+    else:
+        exon_structure[2] = distal_site[1].start
+
+    ######
+    # second part
+    ######
+
+    # check if the mean coverage per sample is above the given cutoff
+    # return a dict with the conditions as keys and a list as value
+    # that holds 0 or 1 at idx-positions that correspond to the coverages
+    valid_mean_cvg_dict = assess_minimum_cvg(exon_structure, bp_coverages_dict, region, options.min_mean_coverage)
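+    # illustrative example (hypothetical condition names): a value of
+    # {"ctrl": [1, 0], "kd": [1, 1]} means that the second ctrl sample is
+    # excluded for this exon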
+
+    if sum([i for lst in valid_mean_cvg_dict.itervalues() for i in lst]) == 0:
+        if options.debug:
+            syserr("[INFO] No sample with sufficient coverage for %s. Exon skipped\n" 
+                   % exon_id)
+        return ([],valid_mean_cvg_dict)
+
+    ######
+    # third part
+    ######
+
+    # handle cases in which only the distal sites remained
+    if len(polyA_sites) == 0:
+        # check exon region for global best break point
+        curr_ex_start = exon_structure[2]
+        curr_ex_end = exon_structure[3]
+        valid_break_point_dict = find_best_break_point_in_full_exon(bp_coverages_dict,
+                                                                    valid_mean_cvg_dict,
+                                                                    region,
+                                                                    strand,
+                                                                    curr_ex_start,
+                                                                    curr_ex_end,
+                                                                    options.min_coverage_region,
+                                                                    options.mse_ratio_threshold,
+                                                                    add_us_ext)
+
+        # return the distal site together with the valid_break_point_dict,
+        # NOT the valid_mean_cvg_dict (get_distal_expression expects the
+        # break point indicators when only the distal site was quantified)
+        return ([distal_site], valid_break_point_dict)
+
+    ######
+    # fourth part
+    ######
+            
+    # find sites with usage
+    
+    # define the mse of the total region for all samples
+    mse_total_region = {}
+    mse_total_ext_region = {}
+    for cond in bp_coverages_dict:
+        mse_total_region[cond] = []
+        mse_total_ext_region[cond] = []
+        cvg_idx = -1
+        for cvg in bp_coverages_dict[cond]:
+            cvg_idx += 1
+
+            # do not consider coverages with little cvg
+            if valid_mean_cvg_dict[cond][cvg_idx] == 0:
+                mse_total_region[cond].append(0)
+                mse_total_ext_region[cond].append(0)
+                continue
+            
+            if strand == "+":
+                start_idx = region.index(exon_structure[2])
+                extended_start_idx = region.index(exon_structure[2] - add_us_ext[cond][cvg_idx])
+
+                end_idx = region.index(exon_structure[3])
+            else:
+                start_idx = region.index(exon_structure[2])
+                try:
+                    end_idx = region.index(exon_structure[3])
+                except ValueError:
+                    end_idx = region.index(exon_structure[3] - 1) + 1
+                try:
+                    extended_end_idx = region.index(exon_structure[3] + add_us_ext[cond][cvg_idx])
+                except ValueError:
+                    # the end of the extended exon might not be accessible as position anymore
+                    extended_end_idx = region.index(exon_structure[3] + add_us_ext[cond][cvg_idx] - 1) + 1
+
+            if strand == "+":
+                y = cvg[start_idx:end_idx]
+                extended_y = cvg[extended_start_idx:end_idx]
+            else:
+                # for the mse calculation, the orientation of the coverage is irrelevant
+                y = cvg[start_idx:end_idx]
+                extended_y = cvg[start_idx:extended_end_idx]
+            reg_mean = np.mean( y )
+            reg_diff = y - reg_mean
+            reg_mse = np.mean( reg_diff**2)
+            mse_total_region[cond].append(reg_mse)
+            
+            # also calculate the mse 
+            # with the exon upstream extension (might be used later)
+            ext_reg_mean = np.mean( extended_y )
+            ext_reg_diff = extended_y - ext_reg_mean
+            ext_reg_mse = np.mean( ext_reg_diff**2)
+            mse_total_ext_region[cond].append(ext_reg_mse)
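+            # (note: reg_mse and ext_reg_mse are simply np.var(y) and
+            # np.var(extended_y), i.e. the mean squared deviation of the
+            # coverage from the segment mean)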
+
+    used_sites = find_sites_with_usage(polyA_sites,
+                                       exon_structure[2],
+                                       exon_structure[3],
+                                       mse_total_region,
+                                       mse_total_ext_region,
+                                       bp_coverages_dict,
+                                       valid_mean_cvg_dict,
+                                       region,
+                                       strand,
+                                       options.min_coverage_region,
+                                       options.best_break_point_upstream_extension, 
+                                       options.mse_ratio_threshold, 
+                                       add_us_ext)
+
+    # append distal site
+    used_sites.append(distal_site)
+
+    # # test
+    # syserr("sites of exon %s\n" % exon_id)
+    # for site_list in used_sites:
+    #     syserr("%s\n" % str(site_list))
+
+    ######
+    # fifth part
+    ######
+
+    # go over the exon once more
+    # check if any better global break point exists
+
+    unsupported_break_point_cases = 0
+    if len(used_sites) > 1:
+        # work on all inferred sites except the distal one; below, the
+        # genomic positions covered by these sites are collected per sample
+        still_undetected = used_sites[:-1]
+
+        # prepare individual dicts for every sample:
+
+        # genomic pos' that are covered by sites in each sample
+        used_sites_positions = {}
+        # exon boundaries
+        curr_ex_start = exon_structure[2]
+        curr_ex_start_dict = {}
+        for cond in bp_coverages_dict:
+            curr_ex_start_dict[cond]=[]
+            for i in range(len(bp_coverages_dict[cond])):
+                # initialize the exon start with the annotated start pos
+                # append the extended upstream region later
+                curr_ex_start_dict[cond].append(curr_ex_start)
+        curr_ex_end = exon_structure[3]
+        curr_ex_end_dict = {}
+        # same for exon end that is appended by the upstream region
+        # later in the case of minus strand exons
+        for cond in bp_coverages_dict:
+            curr_ex_end_dict[cond]=[]
+            for i in range(len(bp_coverages_dict[cond])):
+                curr_ex_end_dict[cond].append(curr_ex_end)
+        
+        # treat the most proximal site separately;
+        # here, for different samples the site might cover different genomic regions because of
+        # variable upstream_extend_regions
+        if strand == "+":
+            start_pos = used_sites[0][1].start - options.best_break_point_upstream_extension
+            end_pos = used_sites[0][1].end
+            end_pos_idx = region.index(end_pos)
+        else:
+            start_pos = used_sites[0][1].start
+            start_pos_idx = region.index(start_pos)
+            end_pos = used_sites[0][1].end + options.best_break_point_upstream_extension
+        for cond in bp_coverages_dict:
+            # also, prepare the dict to store the covered genomic positions
+            used_sites_positions[cond] = []
+            
+            for cvg_idx in range(len(bp_coverages_dict[cond])):
+                used_sites_positions[cond].append({})
+
+                if cond not in still_undetected[0][3]:
+                    # this site is not supported by any sample of this condition
+                    # hence, no dict preparation necessary
+                    continue
+
+                # only treat the current sample if it supports any of the
+                # inferred poly(A) sites
+                if not still_undetected[0][4][cond][cvg_idx]:
+                    continue
+                
+                curr_add_us_ext = add_us_ext[cond][cvg_idx]
+                if strand == "+":
+                    if start_pos < curr_ex_start - curr_add_us_ext:
+                        # start pos (site start minus the best-break-point
+                        # extension) is even further upstream than the
+                        # extended exon; hence, use the extended exon start
+                        # as boundary
+                        start_pos_idx = region.index( curr_ex_start - curr_add_us_ext )
+                        curr_ex_start_dict[cond][cvg_idx] -= curr_add_us_ext
+                    elif start_pos < curr_ex_start:
+                        # start pos is more upstream than the unextended exon start;
+                        # hence, use this start_pos but adjust exon start pos
+                        start_pos_idx = region.index( start_pos )
+                        curr_ex_start_dict[cond][cvg_idx] -= curr_add_us_ext
+                    else:
+                        # start_pos is within normal exon annotation
+                        start_pos_idx = region.index(start_pos)
+                else:
+                    # same for negative strand exons; see comments above for explanation
+                    if end_pos >= curr_ex_end + curr_add_us_ext:
+                        end_pos_idx = region.index( curr_ex_end + curr_add_us_ext -1 ) + 1
+                        curr_ex_end_dict[cond][cvg_idx] += curr_add_us_ext
+                    elif end_pos > curr_ex_end:
+                        end_pos_idx = region.index( end_pos )
+                        curr_ex_end_dict[cond][cvg_idx] += curr_add_us_ext
+                    else:
+                        end_pos_idx = region.index(end_pos)
+                for i in range(start_pos_idx, end_pos_idx):
+                    used_sites_positions[cond][cvg_idx][i] = 0
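+        # used_sites_positions[cond][cvg_idx] now maps region indices covered
+        # by the most proximal site to 0; the loop below fills in the
+        # remaining sites with their respective site_idx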
+        
+        # iterate over all other sites except the most proximal one
+        # for these sites, the covered genomic positions are all the same for all samples
+        for site_idx in range(1,len(still_undetected)):
+
+            if strand == "+":
+                start_pos = (used_sites[site_idx][1].start
+                             - options.best_break_point_upstream_extension)
+
+                # first: make the region as long as necessary but not longer than possible
+                if start_pos < used_sites[site_idx - 1][1].end:
+                    start_pos_idx = region.index( used_sites[site_idx - 1][1].end )
+                else:
+                    start_pos_idx = region.index(start_pos)
+
+                end_pos = used_sites[site_idx][1].end
+                end_pos_idx = region.index(end_pos)
+            else:
+                start_pos = used_sites[site_idx][1].start
+                start_pos_idx = region.index(start_pos)
+                
+                end_pos = (used_sites[site_idx][1].end
+                           + options.best_break_point_upstream_extension)
+
+                # first: make the region as long as necessary but not longer than possible
+                if end_pos > used_sites[site_idx - 1][1].start:
+                    end_pos_idx = region.index( used_sites[site_idx - 1][1].start)
+                else:
+                    end_pos_idx = region.index(end_pos)
+
+            # mark the genomic positions for all relevant samples
+            for cond in bp_coverages_dict:
+                if cond not in still_undetected[site_idx][3]:
+                    # this site is not supported by any sample of this condition
+                    # hence, no dict preparation necessary
+                    continue
+                for cvg_idx in range(len(bp_coverages_dict[cond])):
+                    # only treat the current sample if it supports any of the
+                    # inferred poly(A) sites
+                    if not still_undetected[site_idx][4][cond][cvg_idx]:
+                        continue
+                    for i in range(start_pos_idx, end_pos_idx):
+                        used_sites_positions[cond][cvg_idx][i] = site_idx
+
+                        
+        # if the proximal site is close to the exon start, the upstream
+        # extension of the exon is used during inference of the best break
+        # point (this was ensured above, where the exon start positions were
+        # set individually for every sample)
+
+        # iterate over all samples and infer the global best break point(s)
+        # separately for each sample when this sample supports any
+        # inferred poly(A) site
+        for cond in bp_coverages_dict:
+            for cvg_idx in range(len(bp_coverages_dict[cond])):
+                sample_specific_still_undetected = []
+                # collect the sites that are supported by the current sample
+                for site_idx in range(len(still_undetected)):
+                    if cond not in still_undetected[site_idx][3]:
+                        # this site is not supported by any sample of this condition
+                        sample_specific_still_undetected.append( None )
+                        continue
+                    if still_undetected[site_idx][4][cond][cvg_idx]:
+                        sample_specific_still_undetected.append( still_undetected[site_idx] )
+                    else:
+                        sample_specific_still_undetected.append( None )
+
+                if len([i for i in sample_specific_still_undetected if i is not None]) == 0:
+                    # this sample does not support any poly(A) site;
+                    # skip global break point inference for it
+                    continue
+
+
+                start_idx = region.index(curr_ex_start_dict[cond][cvg_idx])
+                end_idx = region.index(curr_ex_end_dict[cond][cvg_idx] - 1) + 1
+
+                global_mse_result = check_global_best_breaking_points(bp_coverages_dict[cond][cvg_idx],
+                                                                      cond,
+                                                                      cvg_idx,
+                                                                      region,
+                                                                      used_sites_positions[cond][cvg_idx],
+                                                                      sample_specific_still_undetected,
+                                                                      start_idx,
+                                                                      end_idx,
+                                                                      options.min_coverage_region)
+
+                if isinstance(global_mse_result, tuple):
+                    # # test -->
+                    # boundaries = [region[x] for x in global_mse_result[0][:-1] ]
+                    # try:
+                    #     tmp = region[ global_mse_result[0][-1] ]
+                    # except IndexError:
+                    #     tmp = region[-1] + 1
+                    # boundaries.append(tmp)
+                    # syserr("[ERROR] Break points so far: %s\n" % str(boundaries))
+                    # # <-- test
+
+                    if global_mse_result[1] is None:
+                        syserr("[ERROR] No (further) break point found for exon: %s\n" 
+                               % exon_id)
+                        return ([], valid_mean_cvg_dict)
+                    else:
+                        # unsupported break point for current coverage detected
+                        
+                        # syserr("[ERROR] Unsupported break point %i in sample #%i (cond %s) for exon: %s. Exclude this sample for the current exon!\n" 
+                        #        % (region[ global_mse_result[1] ], cvg_idx, cond, exon_id) )
+                        # return ([], valid_mean_cvg_dict,add_us_ext)
+                        valid_mean_cvg_dict[cond][cvg_idx] = -1
+                        unsupported_break_point_cases += 1
+                        # retract support for any site by this sample
+                        for tmp_site_idx in range(len(used_sites) - 1):
+                            if cond in used_sites[tmp_site_idx][3]:
+                                used_sites[tmp_site_idx][4][cond][cvg_idx] = False
+                        continue
+            
+                # elif global_mse_result is "Error":
+                #     return ([], valid_mean_cvg_dict, add_us_ext)
+
+    else:
+        curr_ex_start = exon_structure[2]
+        curr_ex_end = exon_structure[3]
+        valid_break_point_dict = find_best_break_point_in_full_exon(bp_coverages_dict,
+                                                                    valid_mean_cvg_dict,
+                                                                    region,
+                                                                    strand,
+                                                                    curr_ex_start,
+                                                                    curr_ex_end,
+                                                                    options.min_coverage_region,
+                                                                    options.mse_ratio_threshold,
+                                                                    add_us_ext)
+
+        # return the distal site together with the valid_break_point_dict,
+        # NOT the valid_mean_cvg_dict
+        return (used_sites, valid_break_point_dict)
+
+    if unsupported_break_point_cases > 0:
+        syserr("[INFO] For exon %s, %i samples are not further processed due to unsupported global break points in their coverage\n" % (exon_id, unsupported_break_point_cases))
+    return (used_sites, valid_mean_cvg_dict)
+
+def get_distal_expression(used_sites,
+                          exon_structure,
+                          exon_id,
+                          region,
+                          bp_coverages_dict,
+                          valid_break_point_found,
+                          read_length,
+                          min_coverage_region):
+    """Define the expression of the distal
+    site if only this one is inferred"""
+
+    expression_dict = {}
+
+    strand = exon_structure[1]
+
+    if exon_structure[3] - read_length - exon_structure[2] < min_coverage_region:
+        syserr(("[INFO] Single distal site of exon %s is very close to exon start. " +
+                "No reasonable expression estimation possible. Skipped exon\n") % str(exon_id))
+        return None
+    
+    if strand == "+":
+        start_idx = region.index( exon_structure[2] )
+        end_idx = region.index( exon_structure[3] - read_length)
+    else:
+        end_idx = region.index( exon_structure[3] - 1) + 1
+        start_idx = region.index( exon_structure[2] + read_length)
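+    # (the read_length offset presumably excludes the window next to the
+    # site in which coverage decays because reads cannot extend past the
+    # poly(A) site)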
+
+    for cond in bp_coverages_dict:
+        expression_dict[cond] = []
+        for cvg_idx in range(len(bp_coverages_dict[cond])):
+
+            # only infer an expression value if the sample
+            # has valid coverage and no global break point;
+            # valid_break_point_found comes from
+            # find_best_break_point_in_full_exon and holds True/False per
+            # sample (True means a valid break point exists, so skip it)
+            if valid_break_point_found[cond][cvg_idx]:
+                expression_dict[cond].append(-1)
+                continue
+            
+            cvg = bp_coverages_dict[cond][cvg_idx]
+            mean_cvg = np.mean( cvg[start_idx:end_idx] )
+            expr = float("{0:.2f}".format( mean_cvg ))
+            expression_dict[cond].append(expr)
+
+    return expression_dict
+    
+def get_relative_usages(used_sites,
+                        exon_structure,
+                        exon_id,
+                        region,
+                        bp_coverages_dict,
+                        valid_mean_cvg_dict,
+                        read_length,
+                        min_coverage_region,
+                        add_us_extension):
+    """For the obtained sites, infer the relative 
+    usage based on the mean expression of regions"""
+    
+    # define the indices that mark the segment boundaries
+    # store them from proximal to distal
+    strand = exon_structure[1]
+    region_means = {}
+    ds_corrected_means = {}
+    relative_usages = {}
+    expressions = {}
+
+    for cond in bp_coverages_dict:
+        region_means[cond] = []
+        relative_usages[cond] = []
+        expressions[cond] = []
+        ds_corrected_means[cond] = []
+        cvg_counter = -1
+        for cvg in bp_coverages_dict[cond]:
+            cvg_counter += 1
+            
+            if strand == "+":
+                start_idx = region.index( exon_structure[2] )
+                if used_sites[0][1].start - read_length - min_coverage_region < exon_structure[2]:
+                    start_idx = region.index( exon_structure[2] - add_us_extension[cond][cvg_counter] )
+                try:
+                    end_idx = region.index( used_sites[0][1].start - read_length )
+                except ValueError:
+                    end_idx = region.index( exon_structure[2] - add_us_extension[cond][cvg_counter] + 1)
+                if end_idx - start_idx < min_coverage_region:
+                    syserr(("[INFO] Distance from proximal site to exon start of exon %s cond %s sample #%i " +
+                            "is very small despite upstream extension (%i nt). For relative usage inference, " +
+                            "the read length is not excluded from the upstream region of the proximal site\n")
+                           % (exon_id, cond, cvg_counter, add_us_extension[cond][cvg_counter]))
+                    end_idx = region.index( used_sites[0][1].start )
+            else:
+                if used_sites[0][1].end + read_length + min_coverage_region > exon_structure[3]:
+                    end_idx = region.index( exon_structure[3] + add_us_extension[cond][cvg_counter] - 1) + 1
+                else:
+                    end_idx = region.index( exon_structure[3] - 1) + 1
+                try:
+                    start_idx = region.index( used_sites[0][1].end + read_length)
+                except ValueError:
+                    start_idx = end_idx = region.index( exon_structure[3] + add_us_extension[cond][cvg_counter] - 2) + 1
+                if end_idx - start_idx < min_coverage_region:
+                    syserr(("[INFO] Distance from proximal site to exon start of exon %s cond %s sample #%i " +
+                            "is very small despite upstream extension (%i nt). For relative usage inference, " +
+                            "the read length is not excluded from the upstream region of the proximal site\n")
+                           % (exon_id, cond, cvg_counter, add_us_extension[cond][cvg_counter]))
+                    start_idx = region.index( used_sites[0][1].end )
+
+            region_means[cond].append([])
+            relative_usages[cond].append([])
+            expressions[cond].append([])
+            ds_corrected_means[cond].append([])
+            if (valid_mean_cvg_dict[cond][cvg_counter] == 0 or
+                valid_mean_cvg_dict[cond][cvg_counter] == -1):
+                region_means[cond][cvg_counter].append(0)
+            else:
+                y = cvg[start_idx:end_idx]
+                region_means[cond][cvg_counter].append(np.mean(y))
+            
+    # iterate over the remaining poly(A) sites (from proximal to distal)
+    # and get the mean of each upstream segment
+    for site_idx in range(1, len(used_sites)):
+        if strand == "+":
+            start_idx = region.index( used_sites[site_idx-1][1].end )
+            end_idx = region.index( used_sites[site_idx][1].start - read_length )
+        else:
+            start_idx = region.index( used_sites[site_idx][1].end  + read_length )
+            end_idx = region.index( used_sites[site_idx-1][1].start )
+        
+        # consistency check: is region big enough
+        if end_idx - start_idx < min_coverage_region:
+            syserr(("[ERROR] Upstream region of site %s is too small " +
+                    "for reliable mean estimation\n") % used_sites[site_idx][0])
+            return {}, {}
+    
+        for cond in bp_coverages_dict:
+            counter = -1
+            for cvg in bp_coverages_dict[cond]:
+                counter += 1
+                if (valid_mean_cvg_dict[cond][counter] == 0 or
+                    valid_mean_cvg_dict[cond][counter] == -1):
+                    region_means[cond][counter].append(0)
+                else:
+                    y = cvg[start_idx:end_idx]
+                    region_means[cond][counter].append(np.mean(y))
+                
+    
+    # now, all mean values are obtained
+    # next: calculate downstream corrected mean values
+    # from distal to proximal site
+    for cond in region_means:
+        for sample_arr_idx in range(len(region_means[cond])):
+            if (valid_mean_cvg_dict[cond][sample_arr_idx] == 0 or
+                valid_mean_cvg_dict[cond][sample_arr_idx] == -1):
+                continue
+            ds_corrected_means[cond][sample_arr_idx].append( region_means[cond][sample_arr_idx][-1] )
+            
+    for site_idx in reversed(range(len(used_sites) -1)):
+        for cond in region_means:
+            for sample_arr_idx in range(len(region_means[cond])):
+                if (valid_mean_cvg_dict[cond][sample_arr_idx] == 0 or
+                    valid_mean_cvg_dict[cond][sample_arr_idx] == -1):
+                    continue
+                downstream_corrected_mean = region_means[cond][sample_arr_idx][site_idx] - sum(ds_corrected_means[cond][sample_arr_idx])
+                if downstream_corrected_mean < 0:
+                    downstream_corrected_mean = 0
+                ds_corrected_means[cond][sample_arr_idx].insert(0, downstream_corrected_mean)
+
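+    # worked example for one sample: with region means [10, 6, 2] listed
+    # from proximal to distal, the loop above yields downstream-corrected
+    # means [10 - (4 + 2), 6 - 2, 2] = [4, 4, 2], which the block below
+    # turns into relative usages of [40.0, 40.0, 20.0] percent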
+    # iterate once more over the mean values and calculate actual relative usage values
+    for site_idx in range(len(used_sites)):
+        for cond in region_means:
+            for sample_arr_idx in range(len(region_means[cond])):
+                if (valid_mean_cvg_dict[cond][sample_arr_idx] == 0 or
+                    valid_mean_cvg_dict[cond][sample_arr_idx] == -1):
+                    relative_usages[cond][sample_arr_idx].append(-1.0)
+                    expressions[cond][sample_arr_idx].append(-1.0)
+                else:
+                    rel_use = float("{0:.2f}".format( float(ds_corrected_means[cond][sample_arr_idx][site_idx]) / sum(ds_corrected_means[cond][sample_arr_idx]) * 100 ) )
+                    expr = float("{0:.2f}".format( float(ds_corrected_means[cond][sample_arr_idx][site_idx]) ))
+                    relative_usages[cond][sample_arr_idx].append(rel_use)
+                    expressions[cond][sample_arr_idx].append(expr)
+                
+    return relative_usages, expressions
+
+def process_exon_wrapper_function( input_tuple ):
+    """Wrapper function to conduct the 
+    actual processing of each exon and its sites"""
+
+    exon, exon_structure, region, bp_coverages_dict, merged_exons, polyA_sites, exon_ext_dict, options = list(input_tuple)
+
+    (used_sites, valid_mean_cvg_dict) = get_pA_set(exon, 
+                                                   exon_structure, 
+                                                   region, 
+                                                   bp_coverages_dict, 
+                                                   merged_exons, 
+                                                   polyA_sites,
+                                                   exon_ext_dict,
+                                                   options)
+
+    # ATTENTION:
+    # valid_mean_cvg_dict might be the valid_break_point_dict if only the distal site
+    # was quantified
+    if len(used_sites) > 1:
+        mean_cvg_indicator_set = set()
+        for cond in valid_mean_cvg_dict:
+            for i in valid_mean_cvg_dict[cond]:
+                mean_cvg_indicator_set.add(i)
+        if len(mean_cvg_indicator_set) == 1 and list(mean_cvg_indicator_set)[0] == 0:
+            # no sample had enough coverage
+            syserr("[INFO] For no sample global best break points and inferred sites matched for exon %s. Skipped exon!\n"
+                   % str(exon))
+            return (exon, [],[],[])
+        elif len(mean_cvg_indicator_set) == 1 and list(mean_cvg_indicator_set)[0] == -1:
+            # no sample had proper global best break points
+            syserr("[INFO] For no sample global best break points and inferred sites matched for exon %s. Skipped exon!\n"
+                   % str(exon))
+            return (exon, [],[],[])
+        elif (len(mean_cvg_indicator_set) == 2 and
+              0 in mean_cvg_indicator_set and
+              -1 in mean_cvg_indicator_set):
+            # either the samples had too little coverage or their global best break point did not match
+            syserr("[INFO] All sample were discarded either due to global best break point inconsistencies or low coverage for exon %s. Skipped exon!\n"
+                   % str(exon))
+            return (exon, [],[],[])
+
+        # iterate over all sites except the distal one;
+        # a site might have lost the support of all its samples
+        # due to global break point inconsistencies; remove such sites
+        to_del_idx = []
+        for site_idx in range(len(used_sites) - 1):
+            supporting_samples = sum([i for b in used_sites[site_idx][3] for i in used_sites[site_idx][4][b] ] )
+            if 0 == supporting_samples:
+                to_del_idx.append(site_idx)
+
+        for site_idx in sorted(to_del_idx)[::-1]:
+            del used_sites[ site_idx ]
+        
+        if len(to_del_idx) > 0:
+            syserr("[INFO] Number of sites was reduced for exon %s. They had no support by any sample anymore due to global break point inconsistencies\n"
+                   % str(exon))
+
+        if len(used_sites) == 1:
+            # after filtering bogus sites
+            # only the distal site remained -> delete this one as well
+            used_sites = []
+
+    if len(used_sites) > 1:
+        relative_usage_dict, expression_dict = get_relative_usages(used_sites,
+                                                                   exon_structure,
+                                                                   exon,
+                                                                   region,
+                                                                   bp_coverages_dict,
+                                                                   valid_mean_cvg_dict,
+                                                                   options.read_length,
+                                                                   options.min_coverage_region,
+                                                                   exon_ext_dict)
+
+
+        if len(relative_usage_dict) > 0:
+            rel_use_array = []
+            expression_array = []
+            for site_idx in range(len(used_sites)):
+                conditions_counter = {}
+                rel_use_array.append([])
+                expression_array.append([])
+                for file_idx in range(len(options.coverages)):
+                    condition = options.conditions[file_idx]
+                    if condition not in conditions_counter:
+                        conditions_counter[condition] = -1
+                    conditions_counter[condition] += 1
+                    sample = conditions_counter[condition]
+                    rel_use_array[-1].append( relative_usage_dict[condition][ sample ][site_idx])
+                    expression_array[-1].append( expression_dict[condition][ sample ][site_idx])
+
+            return (exon, used_sites, rel_use_array, expression_array)
+
+        else:
+            # no relative usages obtained
+            syserr("[ERROR] Failed to infer relative usages for %s\n" 
+                   % str(exon) )
+            return (exon, [],[], [])
+    elif len(used_sites) == 1:
+        # only the distal site was inferred;
+        # obtain its expression in all samples that
+        # have no conflicting global break point
+        expression_dict = get_distal_expression(used_sites,
+                                                exon_structure,
+                                                exon,
+                                                region,
+                                                bp_coverages_dict,
+                                                valid_mean_cvg_dict,
+                                                options.read_length,
+                                                options.min_coverage_region)
+
+        if expression_dict is None:
+            return (exon, [],[], [])
+        else:
+            # check the case that all samples have a valid global break point
+            total_sample_cnt = 0
+            for cond in expression_dict:
+                total_sample_cnt += len(expression_dict[cond])
+            if sum([i for c in expression_dict for i in expression_dict[c]]) == -total_sample_cnt:
+                # every sample returned -1, i.e. all samples have a
+                # conflicting global break point
+                syserr("[INFO] Distal site skipped due to conflicting global break points in all samples in exon %s. Skipped exon!\n"
+                       % str(exon))
+                return (exon, [],[], [])
+            
+            expression_array = []
+            conditions_counter = {}
+            expression_array.append([])
+            for file_idx in range(len(options.coverages)):
+                condition = options.conditions[file_idx]
+                if condition not in conditions_counter:
+                    conditions_counter[condition] = -1
+                conditions_counter[condition] += 1
+                sample = conditions_counter[condition]
+                expression_array[-1].append( expression_dict[condition][ sample ])
+            return (exon, used_sites, None, expression_array)
+
+    else:
+        # no sites inferred
+        syserr("[INFO] No site was inferred for %s. Skipped exon!\n"
+               % str(exon))
+        return (exon, [],[], [])
+
+def main(options):
+    """Main logic of the script"""
+
+    # initialize the pool for multiprocessing
+    pool = multiprocessing.Pool( options.proc )
+
+    expression_output_file_name = options.expressions_out
+    distal_sites_output_file_name = options.distal_sites
+    
+    ###
+    # load poly(A) sites
+    ###
+
+    # store the terminal exons
+    terminal_exons_dict = {}
+
+    # store the exon upstream extension size per sample
+    term_ex_us_ex_dict = {}
+    
+    with open(options.polyA) as f:
+        for line in f:
+            F = line.strip().split("\t")
+            chrom = F[0]
+            strand = F[5]
+            if F[8] not in terminal_exons_dict:
+                # save stats for this exon and an empty list for the poly(A) sites
+                # adjust the start coordinate of the exon to match
+                # HTSeq and bed conventions
+                exon_stats = F[8].split(":")
+                exon_start = int(exon_stats[3]) - 1
+                exon_end = int(exon_stats[4])
+                terminal_exons_dict[ F[8] ] = [chrom, strand, exon_start, exon_end, [] ]
+            # for each poly(A) site store: id, HTSeq.GenomicInterval of its location, protocol support
+            terminal_exons_dict[ F[8] ][4].append( [ F[3], HTSeq.GenomicInterval( chrom, int(F[1]), int(F[2]), strand ), F[4] ] )
+            term_ex_us_ex_dict[ F[8] ] = {}
+            
+    # go through list of all terminal exons and separate two lists:
+    # 1. exons with clusters that assemble in close proximity
+    # 2. exons with distinct clusters
+    # clusters are expected to be sorted proximal -> distal
+    for exon in terminal_exons_dict:
+        strand = terminal_exons_dict[exon][1]
+        list_of_close_clusters = []
+        close_cluster_idxs = set()
+        polyA_sites = terminal_exons_dict[ exon ][4]
+        for cl_idx in range(1, len(polyA_sites)):
+            prox_site = polyA_sites[cl_idx -1]
+            dist_site = polyA_sites[cl_idx]
+            if ( (strand == "+" and (dist_site[1].start - prox_site[1].end) < options.min_cluster_distance) or
+                 (strand == "-" and (prox_site[1].start - dist_site[1].end) < options.min_cluster_distance) ):
+                # note these two sites as interfering
+                if (cl_idx - 1) in close_cluster_idxs:
+                    # simply update the set of close clusters
+                    close_cluster_idxs.update([cl_idx])
+                else:
+                    # remove all clusters that are in the set so far
+                    # store them separately as one unit
+                    # update the set of closely spaced sites
+                    if len(close_cluster_idxs) > 0:
+                        if len(close_cluster_idxs) < 2:
+                            syserr("[ERROR] set of closely spaced sites has less than 2 entries (%s) for exon %s\n" % (str(close_cluster_idxs), exon))
+                            sys.exit(2) 
+                        list_of_close_clusters.append(sorted(close_cluster_idxs))
+                        close_cluster_idxs = set()
+                    close_cluster_idxs.update([(cl_idx-1), cl_idx])
+        # empty set if necessary
+        if len(close_cluster_idxs) > 0:
+            list_of_close_clusters.append(sorted(close_cluster_idxs))
+                        
+        if len(list_of_close_clusters) > 0:
+            new_pA_list = []
+            last_processed_pA = 0
+            for close_clusters in list_of_close_clusters:
+                for idx in range(last_processed_pA,close_clusters[0]):
+                    new_pA_list.append(polyA_sites[idx])
+                tmp_list = []
+                for close_cl in close_clusters:
+                    tmp_list.append(polyA_sites[close_cl])
+                new_pA_list.append(tmp_list)
+                last_processed_pA = close_clusters[-1] + 1
+            for idx in range(last_processed_pA, len(polyA_sites)):
+                new_pA_list.append(polyA_sites[idx])
+            terminal_exons_dict[exon][4] = new_pA_list
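+            # e.g. sites [A, B, C, D] with B and C closer than
+            # min_cluster_distance become [A, [B, C], D]; such nested lists
+            # are what get_pA_set later detects via its isinstance check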
+
+    ###
+    # load the coverages
+    # and conditions
+    ###
+
+    coverage_pkl_dict = {}
+    for pkl_idx in range(len(options.coverages)):
+        extension_file = None
+        pkl_file = options.coverages[pkl_idx]
+        
+        sample_basename = pkl_file.replace(".pkl", "")
+        sample_basename = os.path.basename(sample_basename)
+        for tmp_file in options.ex_extension_files:
+            if sample_basename in tmp_file:
+                extension_file = tmp_file
+                
+        if options.conditions[pkl_idx] not in coverage_pkl_dict:
+            coverage_pkl_dict[options.conditions[pkl_idx]] = []
+            # prepare the dict for the exon upstream extensions
+            for ex in term_ex_us_ex_dict:
+                term_ex_us_ex_dict[ex][options.conditions[pkl_idx]] = []
+
+        if not pkl_file.endswith("pkl"):
+            syserr("[ERROR] No proper pkl file as input: %s\n" % pkl_file)
+            sys.exit(2)
+
+        with open(pkl_file, 'rb') as pkl_in:
+            cvg = cPickle.load(pkl_in)
+            coverage_pkl_dict[options.conditions[pkl_idx] ].append(cvg)
+
+        number_of_files = len( coverage_pkl_dict[options.conditions[pkl_idx] ] )
+        if extension_file is not None and os.path.isfile(extension_file):
+            with open(extension_file, "r") as ext_file:
+                for line in ext_file:
+                    F = line.rstrip().split("\t")
+
+                    # # test -->
+                    # if F[0] not in term_ex_us_ex_dict:
+                    #     term_ex_us_ex_dict[F[0]] = {}
+                    #     term_ex_us_ex_dict[F[0]][ options.conditions[pkl_idx] ] = [0 for i in range(number_of_files - 1) ]
+                    # # <-- test
+
+                    # append the extension for all exons found in the file
+                    term_ex_us_ex_dict[F[0]][ options.conditions[pkl_idx] ].append( int(F[1]) )
+        # if an exon was not in the extension file
+        # store 0 as extension
+        for ex in term_ex_us_ex_dict:
+            if len( term_ex_us_ex_dict[ex][options.conditions[pkl_idx] ] ) == number_of_files - 1:
+                term_ex_us_ex_dict[ex][ options.conditions[pkl_idx] ].append(0)
+            elif len( term_ex_us_ex_dict[ex][options.conditions[pkl_idx] ] ) != number_of_files:
+                syserr(("[ERROR] Number of entries for exon extensions of ex %s does " +
+                        "not match current number of processed pickle files for cond %s\n")
+                       % (ex, options.conditions[pkl_idx]))
+                sys.exit(2)
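+        # term_ex_us_ex_dict[ex][cond] now holds one upstream extension
+        # value (in nt) per pickle file processed so far for that condition,
+        # with 0 recorded for exons absent from a sample's extension file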
+
+    # Finished reading input
+    syserr("[INFO] %s Finished reading input\n" % time_now())
+    
+    ###
+    # start iterating over all exons:
+    ###
+
+    # stats
+    nr_skipped_exons = 0
+    nr_no_negative_slope_fitted = 0
+    nr_too_little_cov = 0
+    
+    data_entries = []
+
+    for exon in terminal_exons_dict:
+        
+        exon_structure = terminal_exons_dict[ exon ][0:4]
+        chrom = exon_structure[0]
+        strand = exon_structure[1]
+        start = exon_structure[2]
+        end = exon_structure[3]
+
+        polyA_sites = terminal_exons_dict[ exon ][4]
+
+        max_ex_upstream_ext = max( [i for k in term_ex_us_ex_dict[exon] for i in term_ex_us_ex_dict[exon][k] ] )
+        
+        # extend the exon 3' boundary if the most distal site
+        # ends further downstream than the annotated exon end
+        if strand == "+":
+            if isinstance(polyA_sites[-1][0], list) and polyA_sites[-1][-1][1].end > end:
+                end = polyA_sites[-1][-1][1].end
+            if not isinstance(polyA_sites[-1][0], list) and polyA_sites[-1][1].end > end:
+                end = polyA_sites[-1][1].end
+                
+        else:
+            if isinstance(polyA_sites[-1][0], list) and polyA_sites[-1][-1][1].start < start:
+                start = polyA_sites[-1][-1][1].start
+            if not isinstance(polyA_sites[-1][0], list) and polyA_sites[-1][1].start < start:
+                start = polyA_sites[-1][1].start
+            
+        ### merge the coverages of this exon (used for distal site definition)
+        if strand == "+":
+            region = range( (start - max_ex_upstream_ext ), (end + options.ds_reg_for_no_coverage) )
+        else:
+            region = range( (start - options.ds_reg_for_no_coverage), (end + max_ex_upstream_ext) )
+        bp_coverages_dict = {}
+        merged_exons = np.zeros(end - start + options.ds_reg_for_no_coverage + max_ex_upstream_ext)
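+        # note: len(merged_exons) == len(region) for both strands, since the
+        # overall span is (end - start) + ds_reg_for_no_coverage +
+        # max_ex_upstream_ext in either case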
+        for cond in coverage_pkl_dict:
+            bp_coverages_dict[cond] = []
+            for cvg in coverage_pkl_dict[cond]:
+            
+                # extend the 3' end to be able to check the downstream region as well
+                # extend the 5' start to allow handling of proximal site close to the exon start
+                if strand == "+":
+                    curr_cov_arr = np.array( list( cvg[
+                        HTSeq.GenomicInterval( chrom, 
+                                               (start - max_ex_upstream_ext), 
+                                               (end + options.ds_reg_for_no_coverage), 
+                                               strand)
+                    ]
+                    )
+                    )
+                    
+                else:
+                    # do NOT reverse coverage array because
+                    # it needs to match the region list
+                    curr_cov_arr = np.array( list( cvg[
+                        HTSeq.GenomicInterval( chrom, 
+                                               (start - options.ds_reg_for_no_coverage), 
+                                               (end + max_ex_upstream_ext), 
+                                               strand)
+                    ]
+                    )
+                    )
+                # test
+                # normalize the region so that the maximum is at 100
+                #curr_cov_arr = curr_cov_arr / max(curr_cov_arr) * 100
+                bp_coverages_dict[cond].append(curr_cov_arr)
+                merged_exons = merged_exons + curr_cov_arr
+
+        data_entries.append((exon, exon_structure, region, bp_coverages_dict, merged_exons, polyA_sites, term_ex_us_ex_dict[exon], options))
+        
+    result_tuples = pool.map( process_exon_wrapper_function, data_entries)
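+    # pool.map preserves the input order, so result_tuples lines up with
+    # data_entries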
+
+    with open(expression_output_file_name, "w") as ofile:
+        with open( distal_sites_output_file_name, "w") as distal_out:
+
+            for res_tu in result_tuples:
+                exon = res_tu[0]
+                used_sites = res_tu[1]
+                rel_use_array = res_tu[2]
+                expression_array = res_tu[3]
+                if len(used_sites) == 0 and len(rel_use_array) == 0:
+                    continue
+                elif len(used_sites) == 0 or (rel_use_array is not None and len(rel_use_array) == 0):
+                    syserr(("[ERROR] length of list with inferred sites: %i\n" +
+                            "length of list with relative usages: %i\n")
+                           % (len(used_sites), len(rel_use_array)))
+                    syserr("[ERROR] Only the case of both lists have zero length " +
+                       "is expected\n")
+                    sys.exit(2)
+                elif len(used_sites) == 1:
+                    # sanity check
+                    if rel_use_array is not None:
+                        syserr("[ERROR] Only distal site was inferred but quantification failed\n")
+                        sys.exit(2)
+                    expression_vals = "\t".join(str(x) for x in expression_array[0])
+                    total_number_of_sites = len(used_sites)
+                    transcript = exon.split(":")[0]
+                    site = used_sites[0]
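+                    # output columns: chrom, start, end, site id, protocol
+                    # support, strand, site rank (always 1 for a single
+                    # distal site), total number of sites, exon id,
+                    # transcript id, and one expression value per sample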
+                    distal_out.write("%s\t%i\t%i\t%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\n" % (site[1].chrom,
+                                                                                       site[1].start,
+                                                                                       site[1].end,
+                                                                                       site[0],
+                                                                                       site[2],
+                                                                                       site[1].strand,
+                                                                                       1,
+                                                                                       total_number_of_sites,
+                                                                                       exon,
+                                                                                       transcript,
+                                                                                       expression_vals))
+                else: 
+                    for idx in range(len(rel_use_array)):
+                        rel_use_array[idx] = "\t".join(str(x) for x in rel_use_array[idx])
+                        expression_array[idx] = "\t".join(str(x) for x in expression_array[idx])
+                    total_number_of_sites = len(used_sites)
+                    transcript = exon.split(":")[0]
+                    cnt = 0
+                    for site_idx in range(len(used_sites)):
+                        site = used_sites[site_idx]
+                        cnt += 1
+                        sysout("%s\t%i\t%i\t%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\n" % (site[1].chrom,
+                                                                             site[1].start,
+                                                                             site[1].end,
+                                                                             site[0],
+                                                                             site[2],
+                                                                             site[1].strand,
+                                                                             cnt,
+                                                                             total_number_of_sites,
+                                                                             exon,
+                                                                             transcript,
+                                                                             rel_use_array[site_idx]))
+    
+                        ofile.write("%s\t%i\t%i\t%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\n" % (site[1].chrom,
+                                                                                      site[1].start,
+                                                                                      site[1].end,
+                                                                                      site[0],
+                                                                                      site[2],
+                                                                                      site[1].strand,
+                                                                                      cnt,
+                                                                                      total_number_of_sites,
+                                                                                      exon,
+                                                                                      transcript,
+                                                                                      expression_array[site_idx]))
+
+
+if __name__ == '__main__':
+    # record the start time unconditionally so that the KeyboardInterrupt
+    # handler below can report the elapsed time even when --verbose is not set
+    start_time = time.time()
+    try:
+        try:
+            options = parser.parse_args()
+        except Exception:
+            parser.print_help()
+            sys.exit()
+        if options.verbose:
+            start_date = time.strftime("%d-%m-%Y at %H:%M:%S")
+            syserr("############## Started script on %s ##############\n" %
+                   start_date)
+
+        main(options)
+        if options.verbose:
+            syserr("### Successfully finished in %i seconds, on %s ###\n" %
+                   (time.time() - start_time,
+                    time.strftime("%d-%m-%Y at %H:%M:%S")))
+    except KeyboardInterrupt:
+        syserr("Interrupted by user after %i seconds!\n" %
+               (time.time() - start_time))
+        sys.exit(-1)
-- 
GitLab