A-seq read processing pipeline
The scripts and pipelines provided in this repository help to process A-seq read libraries. The repository contains all scripts needed to assemble the processing yourself and additionally provides pipeline scripts that run the entire processing automatically.
To get started, clone the repository into your target directory. Once this is done, ensure that all required external software/scripts are installed.
git clone https://git.scicore.unibas.ch/zavolan_public/A-seq2-processing.git path/to/workdir
cd path/to/workdir
Requirements
To run this pipeline, your computer requires 40 GB of available memory (RAM) to process larger genomes (e.g. human or mouse). Moreover, snakemake is used to facilitate the automated execution of all analysis steps. The easiest way to use the pipeline is to set up a python3 virtual environment and run the pipeline within this environment. The following sections give instructions on how to install the virtual environment for the analysis.
snakemake
Snakemake is a workflow management system that helps to create and execute data processing pipelines. It requires python3 and can be most easily installed via the bioconda channel of the anaconda python distribution. Your setup is complete after the following steps:
Step 1: Downloading Miniconda3
On Linux:
wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
bash Miniconda3-latest-Linux-x86_64.sh
On MacOS X:
wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
bash Miniconda3-latest-MacOSX-x86_64.sh
Step 2: Creating a new environment with all required packages/software
conda create -n Aseq-snakemake -c bioconda -c conda-forge -c ostrokach --file requirements.txt
If you already have a conda version with python2 installed, create the new python 3 environment like this:
conda create -n Aseq-snakemake -c bioconda -c conda-forge -c ostrokach --file requirements.txt python=3
With the installation of the environment, the following software is installed as well:
- Python (v3.5.1)
- Snakemake (v3.9.1)
- Graphviz (v2.38.0)
- PyYaml (v3.12)
- Numpy (v1.11.3)
- Docutils (v0.12)
- Perl (v5.22.2.1)
- fastx toolkit (v0.0.14)
- cutadapt (v1.12)
- STAR (v2.5.2a)
- samtools (v1.3.1)
- bedtools (v2.26.0)
- gzip (v1.7)
Step 3: Activate the environment
source activate Aseq-snakemake
To exit the environment (once you have finished using the pipeline), just execute
source deactivate
Step 4: Download/Provide all necessary files for the used genome:
- genome sequence for the organism of interest: Let's assume we want to work with the human genome (ENSEMBL GRCh38). It can be downloaded from ENSEMBL with the following commands:
wget -P resources ftp://ftp.ensembl.org/pub/release-87/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz
gzip -d resources/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz
Please note: With the commands above, the genome sequence is stored as a fasta file in the resources folder of the working directory. The file must be unzipped (as done by the second command) before the pipeline is run.
- gtf annotation for the organism of interest: The repository already contains the ENSEMBL annotation for the human genome version GRCh38. Other gtf files can be downloaded directly from ENSEMBL. To download another gtf file, just follow the genome download instructions above, i.e. download the file into the resources folder and unzip it afterwards. Please note: GTF and fasta file should be downloaded from the same source to ensure that the chromosome names match.
Step 5: Prepare cluster execution
If you plan to run the pipeline with individual jobs on a high performance computing (HPC) cluster: Execute the following command to ensure that the jobs know where to find the software within the environment we set up in steps 2 and 3.
python scripts/complete-jobscript.py -i jobscript.sh -o jobscript.withJOBID.withPATH.sh
Run the pipeline
Configure input parameters
The working directory contains a file named Aseq_config.yaml. It is the central file in which all user settings, parameter values and path specifications are stored. During a run, all steps of the pipeline retrieve their parameter values from this file. It follows the yaml syntax (find more information about yaml and its syntax here), which makes it easy to read and edit. The main principles are:
- everything that comes after a # symbol is considered a comment and will not be interpreted
- parameters are given as key-value pairs, with key being the name and value the value of a parameter
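For illustration, a minimal excerpt in this style, using two keys that appear later in this README (the shipped Aseq_config.yaml contains many more entries and may format the values slightly differently):
# everything after the hash symbol is ignored
read_5p_start: ....TTT  # key 'read_5p_start' with the value '....TTT'
genome: resources/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa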
Before starting the pipeline, open the Aseq_config.yaml configuration file and set all options as required. This should at least include:
- name of the input directory - where are your input fastq files stored
- name of the output directory - where should the pipeline store the output files (the directory is created if it does not exist)
- name of the log directory - where should the pipeline store the log files
- name(s) of your input samples - please note: If your sample is named sample1.fq.gz then sample1 will be kept as naming scheme throughout the entire run to indicate output files that belong to this input file, e.g. the pipeline will create a file called sample1.3pSites.noIP.bed.gz. If you have multiple input files, just follow the given pattern with one sample name per line and a dash that indicates another list item (see the example below).
- UMI length - if the length of the random nucleotides was changed, then the read 5' start pattern needs to be adjusted (default: read_5p_start: ....TTT)
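For example, with two input files sample1.fq.gz and sample2.fq.gz, the corresponding list in the configuration file would look roughly like this (the key name 'samples' is only a placeholder here; use the key name that is already present in the shipped Aseq_config.yaml):
samples:  # placeholder key name - keep the one used in Aseq_config.yaml
  - sample1
  - sample2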
If you downloaded a genome other than the one given above, it is required to also set the correct path and file name for the genome key. When the gtf annotation is different as well, the values for gene_annotation and utr_name also need to be changed (utr_name refers to the label used in the gtf to mark 3' UTR regions).
Start a run
Once you have set up your configuration file, running the pipeline locally on your computer is as easy as invoking:
max_cores=4 # maximum number of threads that will run in parallel
# if not yet done: source activate Aseq-snakemake
snakemake -p --cores ${max_cores}
or, if you would like to save the snakemake messages in a log file:
max_cores=4 # maximum number of threads that will run in parallel
snakemake -p --cores ${max_cores} &> log_output.log
It is recommended to set the max_cores parameter so that multiple steps of the pipeline can run in parallel. Moreover, some scripts of the pipeline make use of a parallel environment, and defining a maximum number of available cores prevents overloading the machine.
The same pipeline can also easily be run in a parallel cluster environment. It was tested on a cluster computer with CentOS 6.5 as operating system and Univa Grid Engine 8.3 as queuing system. Here, the pipeline has to be invoked as follows:
max_jobs=100 # maximum number of concurrently submitted jobs
snakemake \
--cluster-config cluster_config.json \
--jobscript jobscript.withJOBID.withPATH.sh \
--cluster "qsub -pe smp {cluster.threads} \
-cwd \
-l membycore={cluster.mem} \
-R y \
-l runtime={cluster.time} \
-o {params.cluster_log} \
-j y" \
--jobs ${max_jobs}
Detailed explanations for each step of the pipeline
The following notes assume that the pipeline was started with default configurations, the current working directory as results directory and one sample called test.fq.gz. Log files for the different steps are stored in the logs/ directory.
create_log_dir:
input: -
output: *temp, created_log.tmp
details: Create a directory for log entries written by HPC jobs (only used when the pipeline is run on an HPC cluster). *local
shell command: pipeline inherent python code
fastq2fasta:
input: test.fq.gz
output: *temp, test.fa.gz
details: Convert the input fastq file into fasta format.
shell command:
perl scripts/ag-convert-FASTQ-to-fasta.pl \
--fastq_to_fasta=fastq_to_fasta \
--fastx_renamer=fastx_renamer \
--oufile=test.fa.gz \
test.fq.gz
raw_read_cnt:
input: test.fa.gz
output: *temp, counts/test.raw.nr.out
details: Count the number of raw reads the run starts with. *local
shell command: pipeline inherent python code
select_for_valid_5p_configuration:
input: test.fa.gz
parameters:
- adapter: sequence of the 5' adapter that is cleaved off (default: "....TTT"; name in config-file: 'read_5p_start')
output: *temp, test.5ptrimmed.UMI_available.fa.gz
details: Discard reads that do not start with the expected pattern NNNNTTT; append the starting tetramer sequence as unique molecular identifier (UMI) to the read name.
shell command:
adapter="....TTT"
zcat test.fa.gz \
| perl scripts/rs-filter-by-5p-adapter.keep5pAdapter.pl \
--adapter=${adapter} \
| gzip > test.5ptrimmed.UMI_available.fa.gz
proper_5p_end_read_cnt:
input: test.5ptrimmed.UMI_available.fa.gz
output: *temp, counts/test.5ptrimmed.nr.out
details: Count the number of remaining reads. *local
shell command: pipeline inherent python code
collapse_full_UMI_duplicates:
input: test.5ptrimmed.UMI_available.fa.gz
output: *temp, test.UMI_dupl_removed.fa.gz
details: Keep only one copy of reads that share the same sequence as well as their UMI.
shell command:
zcat test.5ptrimmed.UMI_available.fa.gz \
| perl scripts/rs-collapse-UMIduplicates.keepUMI.pl \
| gzip > test.UMI_dupl_removed.fa.gz
no_PCR_dupl_read_cnt:
input: test.UMI_dupl_removed.fa.gz
output: *temp, counts/test.no_full_PCR_dupl.nr.out
details: Count the number of remaining reads. *local
shell command: pipeline inherent python code
trim_3p_adapter:
input: test.UMI_dupl_removed.fa.gz
parameters:
- adapt: 5' sequence of the 3' adapter (default: "TGGAATTCTCGGGTGCCAAGG"; name in config-file: '3p_adapter')
- minLen: minimum length a read must have after adapter trimming, otherwise it is discarded (default: 15, name in config-file: 'min_length')
output: *temp, test.trimmed.UMI.fa.gz
details: Remove adapter sequence from the 3' end of reads, reverse complement the reads and keep only those with a minimum length.
shell command:
adapt="TGGAATTCTCGGGTGCCAAGG"
minLen=15
cutadapt -a ${adapt} \
--minimum-length ${minLen} \
test.UMI_dupl_removed.fa.gz \
| fastx_reverse_complement \
| gzip > test.trimmed.UMI.fa.gz
no_3pAdapter_read_cnt:
input: test.trimmed.UMI.fa.gz
output: *temp, counts/test.trimmed.nr.out
details: Count the number of remaining reads. *local
shell command: pipeline inherent python code
get_valid_reads:
input: test.trimmed.UMI.fa.gz
parameters:
- maxN: maximum number of N nucleotides per read (default: 2; name in config-file: 'maxN')
- maxAcontent: maximum fraction of A nucleotides per read (default: 0.8; name in config-file: 'maxAcontent')
output: *temp, test.valid.trimmed.UMI.fa.gz
details: Remove reads with more than 2 Ns, more than 80 % As or an A at the 3' end. *local
shell command:
maxN=2
maxAcontent=0.8
zcat test.trimmed.UMI.fa.gz \
| perl scripts/ag-filter-seqs-by-nucleotide-composition.pl --max ${maxN} --nuc N \
| perl scripts/ag-filter-seqs-by-nucleotide-composition.pl --max ${maxAcontent} --nuc A \
| perl scripts/ag-filter-seqs-by-last-nuc.pl \
| gzip > test.valid.trimmed.UMI.fa.gz
valid_read_cnt:
input: test.valid.trimmed.UMI.fa.gz
output: *temp, counts/test.valid.nr.out
details: Count the number of remaining reads with appropriate quality. *local
shell command: pipeline inherent python code
create_STAR_index:
input: -
parameters:
- read_len: the index should be built for the longest read length minus one (default: number_of_sequencing_cycles - 5_prime_adapter_length - 1 = 50 - 7 - 1; name in config-file: 'read_length')
- genome: fasta file of the used genome (name in config-file: 'genome')
- gtf: corresponding gene annotation as gtf file (name in config-file: 'gene_annotation')
- idx_dir: directory to store the created files into (name in config-file: 'STAR_idx')
threads: default: 8, name in config-file: 'threads.STAR'
output: the directory given above (idx_dir) with several index files, e.g. SAindex
details: Create an index for the genome. This step has high memory requirements (GRCh38: ~31 GB).
shell command:
genome="resources/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa"
gtf="resources/Homo_sapiens.GRCh38.87.gtf"
idx_dir="resources/Homo_sapiens.GRCh38.50nt.index"
mkdir -p ${idx_dir}
STAR \
--runMode genomeGenerate \
--sjdbOverhang 52 \
--genomeDir ${idx_dir} \
--genomeFastaFiles ${genome} \
--runThreadN 8 \
--sjdbGTFfile ${gtf}
mapping:
input: STAR_index, test.valid.trimmed.UMI.fa.gz
parameters:
- anno: gene annotation as gtf file (name in config-file: 'gene_annotation')
- outdir: name of the directory for the STAR output (default: "test.STAR_out/")
- idx_dir: directory with index files (name in config-file: 'STAR_idx')
threads: default: 8, name in config-file: 'threads.STAR'
output: test.STAR_out/Aligned.sortedByCoord.out.bam
details: Map the reads to the genome (splice mappings are considered). STAR must not soft-clip the ends of reads. Similar to "create_STAR_index", this step has high memory requirements.
shell command:
anno="resources/Homo_sapiens.GRCh38.87.gtf"
outdir="test.STAR_out/"
idx_dir="resources/Homo_sapiens.GRCh38.50nt.index"
mkdir -p ${outdir}
STAR \
--runMode alignReads \
--twopassMode Basic \
--runThreadN 8 \
--genomeDir ${idx_dir} \
--sjdbGTFfile ${anno} \
--readFilesIn test.valid.trimmed.UMI.fa.gz \
--outFileNamePrefix ${outdir} \
--outSAMtype BAM SortedByCoordinate \
--limitBAMsortRAM 31532137230 \
--readFilesCommand zcat \
--outSAMstrandField intronMotif \
--alignIntronMax 200000 \
--outReadsUnmapped Fastx \
--outSAMattributes All \
--alignEndsType EndToEnd
bam2bed:
input: test.STAR_out/Aligned.sortedByCoord.out.bam
threads: default: 8 (name in config-file: 'threads.bam2bed')
output: *temp, test.reads.bed.gz
details: Convert each mapping into BED-format. Store the edit distance for each mapping as score in BED column 5.
shell command:
samtools view test.STAR_out/Aligned.sortedByCoord.out.bam \
| python scripts/rs-bam2bed.py \
--processors 8 \
| gzip > test.reads.bed.gz
select_by_edit_distance:
input: test.reads.bed.gz
output: *temp, test.edit_distance_filtered.bed.gz
details: When a read maps to multiple locations, only keep the mapping(s) with the smallest edit distance. Set the score of each kept mapping to 1 / n, with n = number of mappings with the smallest edit distance (see the illustration below the command).
shell command:
python scripts/rs-select-min-edit-distance.py \
--bed test.reads.bed.gz \
| gzip > test.edit_distance_filtered.bed.gz
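For illustration (hypothetical coordinates and read name): a read that maps with the same minimal edit distance to two loci is reported twice in the output, each time with score 1/2 = 0.5 in BED column 5:
chr1    10000    10050    read_1    0.5    +
chr7    52000    52050    read_1    0.5    +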
collapse_mapped_UMI_duplicates:
input: test.edit_distance_filtered.bed.gz
output: *temp, test.reads.collapsed.bed.gz, counts/test.mapped.nr.out
details: From multiple reads with the same UMI mapping to the same location, keep only one. Also, count the number of uniquely mapping reads and reads that map to multiple loci.
shell command:
# the input file is passed as the last argument here (the script may alternatively read it from stdin)
perl scripts/rs-collapse-mapped-UMIduplicates.pl \
--count_out=counts/test.mapped.nr.out \
test.edit_distance_filtered.bed.gz \
| gzip > test.reads.collapsed.bed.gz
mapped_read_cnt:
input: counts/test.valid.nr.out, counts/test.mapped.nr.out
output: counts/test.mapped.noUMI_duplicates.nr.out
details: Appends the number of unique and multi mappers to the counts of valid reads. *local
shell commands: pipeline inherent python code
get_3p_ends:
input: test.reads.collapsed.bed.gz
parameters:
- exclude_chr: list of chromosomes that are excluded from the output (e.g. mitochondrial or Y chromosome; name in config-file: 'excluded_chr')
- min_align: minimum number of nucleotides at the 3' end that have to align perfectly to the reference (default: 4, name in config-file: 'min_3p_align')
output: *temp, test.3pSites.bed.gz
details: From each mapping, keep only the position of the 3' end, and only if its terminal nucleotides (at least min_align of them) aligned perfectly to the reference. These sites indicate the actual cleavage sites.
shell command:
min_align=4
perl scripts/rs-get-3pEnds-from-bed.pl \
--exclude=Y --exclude=M \
--strict \
--min_align=${min_align} \
test.reads.collapsed.bed.gz \
| gzip > test.3pSites.bed.gz
raw_3p_ends_cnt:
input: test.3pSites.bed.gz
output: *temp, counts/test.raw_3p_ends.nr.out
details: Count the number of retrieved raw 3' end processing sites. *local
shell command: pipeline inherent python code
fetch_flanking_seqs:
input: test.3pSites.bed.gz
parameters:
- upstream_ext: number of nucleotides added upstream to the cleavage site for sequence retrieval (default: 0, name in config-file: 'upstream_region.IP')
- downstream_ext: number of nucleotides added downstream to the cleavage site for sequence retrieval (default: 10, name in config-file: 'downstream_region.IP')
- genome: fasta file for the genome sequence (name in config-file: 'genome')
output: *temp, test.3pSites.bed.seqs.gz
details: Retrieve the genomic sequence around the raw 3' end processing sites.
shell command:
upstream_ext=0
downstream_ext=10
genome="resources/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa"
perl scripts/rs-fetch-flanking-fasta.pl \
--genome=${genome} \
--upstream=${upstream_ext} \
--downstream=${downstream_ext} \
test.3pSites.bed.gz \
| gzip > test.3pSites.bed.seqs.gz
assign_IP_sites:
input: test.3pSites.bed.seqs.gz
parameters:
- upstream_ext: number of nucleotides added upstream to the cleavage site for sequence retrieval (default: 0, name in config-file: 'upstream_region.IP')
- downstream_ext: number of nucleotides added downstream to the cleavage site for sequence retrieval (default: 10, name in config-file: 'downstream_region.IP')
- tot_As: number of overall As in the downstream region so that the 3' end site is considered as internal priming event (default: 7, name in config-file: 'total_As')
- consec_As: number of consecutive As in the downstream region so that the 3' end site is considered as internal priming event (default: 6, name in config-file: 'consecutive_As')
- ds_patterns: list of patterns. If the sequence directly downstream of the cleavage site starts with one of these patterns, the 3' end site is considered as internal priming event (default: AAAA, AGAA, AAGA, AAAG, name in config-file: 'downstream_patterns')
output: test.3pSites.ip.bed.gz
details: Based on pre-defined patterns, assess which 3' end processing sites are likely internal priming artifacts. Mark these. This file is an important output file that is used in later steps to create poly(A) site clusters.
shell command:
upstream_ext=0
downstream_ext=10
tot_As=7
consec_As=6
perl scripts/ag-assign-internal-priming-sites.pl \
--upstream_len=${upstream_ext} \
--downstream_len=${downstream_ext} \
--consecutive_As=${consec_As} \
--total_As=${tot_As} \
--ds_pattern=AAAA --ds_pattern=AGAA --ds_pattern=AAGA --ds_pattern=AAAG \
test.3pSites.bed.seqs.gz \
| gzip > test.3pSites.ip.bed.gz
noIP_3p_ends:
input: test.3pSites.ip.bed.gz
output: test.3pSites.noIP.bed.gz
details: Create a BED-file that only contains valid (i.e. non-IP) 3' end processing sites. This file is intended for users who want to proceed with individual 3' end sites. *local
shell command:
zcat test.3pSites.ip.bed.gz \
| perl -ne 'my @F = split("\t"); print $_ if($F[3] ne "IP");' \
| gzip > test.3pSites.noIP.bed.gz
noIP_3p_ends_cnt:
input: test.3pSites.noIP.bed.gz
output: counts/test.summary.tsv
details: Count the number of valid individual raw 3' end processing sites. Output a tsv file that contains all numbers collected for the current sample (from raw reads to 3' end processing sites). All other count-related files are deleted. *local
shell command: pipeline inherent python code
sites_feature_overlap_stats:
input: test.3pSites.noIP.bed.gz
parameters:
- gtf: gene annotation as gtf file (see above)
- utr_name: label that indicates 3' UTR regions in the gtf annotation (if 3' UTR and 5' UTR are not marked separately, the output also reports the overlap of 3' end sites with annotated 5' UTR regions; default: 'three_prime_utr', name in config-file: 'utr_name')
output: counts/annotation_overlap/test.raw_3p_ends.noIP.summary.tsv
details: Intersect all provided 3' end processing sites with the given annotation. If sites do not overlap with any annotated feature, mark them as intergenic. Otherwise, differentiate if a site intersects with:
- an exon
- an intron
- a terminal exon
- a 3' UTR.
If a site overlaps with different features, count it partially towards each of them. For each feature from the above list, the output reports the absolute number of overlaps (these numbers might not be integers) and the fraction of sites falling into this feature out of all sites that intersect with any of the features. Hence, the fractions over all listed features sum up to one.
shell command:
gtf="resources/Homo_sapiens.GRCh38.87.gtf"
utr_name="three_prime_utr"
python scripts/rs-sites-to-feature-annotation.py \
--verbose \
--gtf ${gtf} \
--utr_name ${utr_name} \
--bed test.3pSites.noIP.bed.gz \
> counts/annotation_overlap/test.raw_3p_ends.noIP.summary.tsv
pool_samples:
input: test.3pSites.ip.bed.gz # and all other files of this study, like: test2.3pSites.ip.bed.gz, test3.3pSites.ip.bed.gz, ...
output: 3pSites.tsv.gz
details: Merge 3' end processing sites from individual samples into a comprehensive table for the entire study. Keep track of the raw reads per site from each individual sample.
shell command:
perl scripts/ag-pool-sites.pl \
--noip \
test.3pSites.ip.bed.gz \
| gzip > 3pSites.tsv.gz
pooled_3p_sites_cnt:
input: 3pSites.tsv.gz
output: *temp, counts/pooled_3p_ends.nr.out
details: Count the number of overall determined 3' end processing sites. *local
shell command: pipeline inherent python code
assign_polyA_signals:
input: 3pSites.tsv.gz
parameters:
- signals: poly(A) signals that are assigned to individual 3' end sites (the list of signals comprises: AATAAA, ATTAAA, TATAAA, AGTAAA, AATACA, CATAAA, AATATA, GATAAA, AATGAA, AATAAT, AAGAAA, ACTAAA, AATAGA, ATTACA, AACAAA, ATTATA, AACAAG, AATAAG)
- genome: fasta file for the genome sequence (name in config-file: 'genome')
output: 3pSites.PAS.tsv.gz
details: Assign poly(A) signals to 3' end processing sites: consider the region from 60 nt upstream to 10 nt downstream and note every occurrence of any of the hexamer motifs given under signals.
shell command:
genome="resources/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa"
# only 2 out of the 18 motifs are listed here as an example
perl scripts/ag-assign-polyAsignals.pl \
--motif=AATAAA --motif=ATTAAA \
--genome=${genome} \
3pSites.tsv.gz \
| gzip > 3pSites.PAS.tsv.gz
sample_specific_bg:
input: 3pSites.PAS.tsv.gz
parameters:
- cutoff: Percent of preliminary clusters that are required to have an annotated poly(A) signal (default: 90; name in config-file: 'polyAsignal_cutoff')
- upstream: length in nucleotides to search upstream for 3' end sites in order to create preliminary poly(A) site clusters (default: 25, name in config-file: 'upstream_bg_clusters')
- downstream: length in nucleotides to search downstream for 3' end sites in order to create preliminary poly(A) site clusters (default: 25, name in config-file: 'downstream_bg_clusters')
output: *temp, filteredSites/test.filtered.tsv # and potential other samples from the study like: filteredSites/test2.filtered.tsv, ...
details: Define a sample-specific background level. Do so by merging individual 3' end sites that lie within 25 nt up- or downstream of higher-ranked 3' end sites into preliminary poly(A) site clusters. Iterate through the list of created clusters from highest to lowest expression and find the expression level below which the percentage of clusters with an annotated poly(A) signal drops below the given cutoff. Consider all 3' end sites that contribute to poly(A) site clusters below this expression cutoff as background and set their expression to 0 (i.e. ignore them in later steps).
shell command:
cutoff=90
upstream=25
downstream=25
perl scripts/rs-find-sample-specific-cutoff.pl \
--cutoff=${cutoff} \
--upstream=${upstream} \
--downstream=${downstream} \
--sample=test \
3pSites.PAS.tsv.gz \
> filteredSites/test.filtered.tsv
create_noBG_3pSites_table:
input: filteredSites/test.filtered.tsv # and potential other samples from the study like: filteredSites/test2.filtered.tsv, ...
output: *temp, 3pSites.PAS.filtered.tsv.gz
details: Merge together the sample-specific tables in which the expression of background 3' end sites was set to 0. The output table is essentially the same as 3pSites.PAS.tsv.gz except that background reads are deleted.
shell command: pipeline inherent python code
delete_zero_rows:
input: 3pSites.PAS.filtered.tsv.gz
output: 3pSites.PAS.noBG.tsv.gz
details: By setting the expression of 3' end processing sites in different samples to zero (see the sample_specific_bg step above), it is possible that individual 3' end sites no longer have any assigned expression in any sample. Delete these 3' end processing sites without expression in any sample.
shell command: pipeline inherent python code
cluster_sites:
input: 3pSites.PAS.noBG.tsv.gz"
parameters:
- upstream_ext: region upstream of high ranked sites to find lower ranked sites that are clustered (default: 12, name in config-file: 'upstream_clusters')
- downstream_ext: region downstream of high ranked sites to find lower ranked sites that are clustered (default: 12, name in config-file: 'downstream_clusters')
output: *temp, clusters.primary.tsv.gz
details: Cluster together closely spaced 3' end processing sites. Do so by:
- ranking sites according to their number of supporting samples, followed by the total expression of the sites
- iterating over the sites from highest to lowest rank
- assigning lower ranked sites to higher ranked sites if they are in the predefined up- or downstream region of the higher ranked site.
Additionally, label poly(A) site clusters as potential internal priming candidates as soon as any constituting 3' end site of the cluster overlaps with an annotated poly(A) signal. **
shell command:
upstream_ext=12
downstream_ext=12
perl scripts/ag-generate-clusters.pl \
--upstream=${upstream_ext} \
--downstream=${downstream_ext} \
3pSites.PAS.noBG.tsv.gz \
| gzip > clusters.primary.tsv.gz
merge_clusters:
input: clusters.primary.tsv.gz
parameters:
- maxsize: maximum cluster size that merged clusters must not exceed (except for those that share the same poly(A) signal; default: 25, name in config-file: 'max_cluster_size')
- minDistToPAS: only relevant for the processing of potential internal priming candidates; minimum distance of the most 3' located individual site of a cluster to the closest annotated poly(A) signal; rescue the cluster if the distance is greater than or equal to the given minimum (default: 15, name in config-file: 'min_dist_to_PAS')
output: clusters.merged.tsv.gz
details: Further merge poly(A) site clusters if they share the same set of poly(A) signals or if their combined length does not exceed the given maximum. Further resolve potential internal priming candidates. **
shell command:
maxsize=25
minDistToPAS=15
perl scripts/ag-merge-clusters.pl \
--maxsize=${maxsize} \
--minDistToPAS=${minDistToPAS} \
clusters.primary.tsv.gz \
| gzip > clusters.merged.tsv.gz
clusters_to_bed:
input: clusters.merged.tsv.gz
output: clusters.merged.bed
details: Convert the tsv-table from the last step into proper BED-format. Use the summed normalized expression as score in column 5 (the read count per 3' end site per sample is library-normalized to reads per million, and the overall cluster expression is the sum over all constituting individual 3' end sites across all samples; see the worked example below the command).
shell command:
perl scripts/ag-clusters-to-BED.pl clusters.merged.tsv.gz clusters.merged.bed
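As a worked example with illustrative numbers: if sample1 contributes 4 reads to a given 3' end site and the sample's library contains 2,000,000 reads in total (the exact normalization basis is defined by the conversion script), this site contributes 4 / 2,000,000 * 1,000,000 = 2 reads per million; the score in column 5 is the sum of such contributions over all constituting 3' end sites of the cluster and over all samples.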
clusters_cnt:
input: clusters.merged.bed
output: counts/clusters.nr.out
details: Count the number of obtained poly(A) site clusters. Create a file that reports the number of pooled 3' end sites and the number of obtained clusters. *local
shell command: pipeline inherent python code
clusters_feature_overlap_stats:
input: clusters.merged.bed
parameters:
- gtf: gene annotation as gtf file (see above)
- utr_name: label that indicates 3' UTR regions in the given annotation (see above)
output: counts/annotation_overlap/clusters.summary.tsv
details: Intersect all provided poly(A) site clusters with the given annotation. If clusters do not overlap with any annotated feature, mark them as intergenic. Otherwise, differentiate if a cluster intersects with:
- an exon
- an intron
- a terminal exon
- a 3' UTR.
If a cluster overlaps with different features, count it for each of them. In the output, for each of the given features the total number of clusters that overlap with this feature type and the corresponding fraction are reported. Due to the counting scheme described above, the reported fractions do not sum up to 1. There are two scenarios in which a cluster is assigned to multiple features:
- the cluster might overlap an intron-exon border of a transcript (i.e. it overlaps the boundary between two features)
- the cluster might be located in a region that is annotated as terminal exon for one transcript but as intron for another transcript (i.e. the annotation for a certain region is ambiguous).
In both cases, it makes sense to count the cluster for both features. Hence the fraction per feature in the output report can be read as: from all clusters, the given fraction x overlaps with the feature y.
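As a worked example with illustrative numbers: assume 100 clusters in total, of which 55 overlap a 3' UTR, 50 a terminal exon, 20 an intron and 15 an exon (some clusters overlap more than one feature); the reported fractions are then 0.55, 0.50, 0.20 and 0.15, which sum to more than 1 because clusters overlapping several features are counted once for each of them.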
shell command:
gtf="resources/Homo_sapiens.GRCh38.87.gtf"
utr_name="three_prime_utr"
python scripts/rs-clusters-to-feature-annotation.py \
--verbose \
--gtf ${gtf} \
--utr_name ${utr_name} \
--bed clusters.merged.bed \
> counts/annotation_overlap/clusters.summary.tsv
*temp: indicates temporary files that are deleted as soon as the pipeline has run successfully. Under normal conditions you will therefore never find these files in your results directory.
*local: these steps are executed on the local machine, even when the pipeline is run in cluster mode
** The description given in this README only briefly covers the case of potential internal priming events due to priming to A-rich regions around the poly(A) signal. Please refer to the accompanying publication for more details on this case.