diff --git a/.Rhistory b/.Rhistory new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..4ca5d5f59152ee9c7625c42825def1439bb8293d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,18 @@ +##### BASE ##### +FROM python:3.10-slim-buster + +##### VARIABLES ##### +WORKDIR /Users/terminal-fragment-selector/ + +COPY requirements.txt /Users/terminal-fragment-selector/requirements.txt +COPY requirements_dev.txt /Users/terminal-fragment-selector/requirements_dev.txt + +##### INSTALL ##### +RUN apt-get update \ +&& apt-get install gcc -y \ +&& apt-get clean +RUN pip install -r requirements.txt -r requirements_dev.txt + + + + diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5f6e6c8270054060826d62670ee80687d936f42c --- /dev/null +++ b/README.md @@ -0,0 +1,48 @@ +# Terminal fragment selecting +Simulating single cell RNA library generation (scRNA-seq) + +This repository is as part of the uni basel course <E3: Programming for Life Science – 43513>. To test the accuracy of scRNA-seq data we generated the *synthetic data*. That is, we reconstruct the properties of the experimental data set and determine whether the computational analysis can recover properties of the data that was assumed in the simulation. This is never trivial since setting the ground truth is much needed in the computational method to evaluate the result. + +# Synopsis +As part of the sub-project, we implemented python code for selecting terminal fragments. Detailed distribution used for the selecting fragments can be found below. [title](https://www.nature.com/articles/srep04532#MOESM1) +> Next Generation Sequencing (NGS) technology is based on cutting DNA into small fragments and their massive parallel sequencing. The multiple overlapping segments termed “reads†are assembled into a contiguous sequence. To reduce sequencing errors, every genome region should be sequenced several dozen times. This sequencing approach is based on the assumption that genomic DNA breaks are random and sequence-independent. However, previously we showed that for the sonicated restriction DNA fragments the rates of double-stranded breaks depend on the nucleotide sequence. In this work we analyzed genomic reads from NGS data and discovered that fragmentation methods based on the action of the hydrodynamic forces on DNA, produce similar bias. Consideration of this non-random DNA fragmentation may allow one to unravel what factors and to what extent influence the non-uniform coverage of various genomic regions. + +As a whole project, we implemented a procedure for sampling reads from mRNA sequences, incorporating a few sources of “noiseâ€. These include the presence of multiple transcript isoforms from a given gene, some that are incompletely spliced, stochastic binding of primers to RNA fragments and stochastic sampling of DNA fragments for sequencing. We will then use standard methods to estimate gene expression from the simulated data. We will repeat the process multiple times, each time corresponding to a single cell. We will then compare the estimates obtained from the simulated cells with the gene expression values assumed in the simulation. We will also try to explore which steps in the sample preparation have the largest impact on the accuracy of gene expression estimates. + + +# Usage +Input: +- fasta_file +- counts_file +- sep +"""Takes as input FASTA file of cDNA sequences, a CSV/TSV with sequence counts, and mean and std. dev. of fragment lengths and 4 nucleotide probabilities for the cuts. Outputs most terminal fragment (within desired length range) for each sequence.""" + +Output: +- fasta dict and sequence for counts file, n_cuts + +To install package, run + +``` +pip install -r requirements.txt +pip install -r requirements_dev.txt +``` + + +# Development + +To build Docker image, run + +``` +docker build -t terminal_fragment_selector . +``` + +Afterwards, you can use the Docker image like so + + +# License + +MIT license, Copyright (c) 2021 Zavolan Lab, Biozentrum, University of Basel + +# Contact +zavolab-biozentrum@unibas.ch + diff --git a/frag_selec.nf b/frag_selec.nf new file mode 100644 index 0000000000000000000000000000000000000000..1fc9bdc7290f3a18d3aa28d66a6f82a962110b5a --- /dev/null +++ b/frag_selec.nf @@ -0,0 +1,126 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl=2 + +```c +/* + * Define the input parameters"""Takes as input FASTA file + of cDNA sequences, a CSV/TSV with sequence + counts, and mean and std. dev. of fragment + lengths and 4 nucleotide probabilities + for the cuts. Outputs most terminal + fragment (within desired length range) + for each sequence.""" + */ +params.fasta_file = "$projectDir/tests/test_files/test,fasta" +params.counts_file = "$projectDir/tests/test_files/test.csv" +params.sep = "$projectDir/data/yeast/sep/sep.csv" +params.outdir = "results" + + +/* Log some information for the user */ + +log.info """\ + R N A S E Q - N F P I P E L I N E + =================================== + fasta_file : ${params.fasta_file} + counts_file : ${params.counts_file} + outdir : ${params.outdir} + """ + .stripIndent() + +/* + * Define the `file_validation` process: + * Validate input files exist and are the correct format + */ + +process file_validation { + + input: + path fasta_file + path counts_file + path sep + + output: + tuple: fasta dict and sequence for counts file + + script: + """ + salmon index --threads $task.cpus -t $transcriptome -i index + """ +} + + +/* + * Define the get_cut_number process: + * Get the number of cuts for a particular sequence + */ + +process get_cut_number { + + tag "get_cut_numbern on $n_cuts" + publishDir "${params.outdir}/get_cut_number", mode:'copy' + + input: + path index + tuple val(n_cuts), path(seq_len, mean) + + output: + path(n_cuts) + + script: + """ + + """ +} + + +/* + * Define the fragmentation process: + * Fragment cDNA sequences and select terminal fragment + */ + +process fragmentation { + + tag "fragmentation on $fasta, seq_counts, nuc_probs, mu_length, std" + + input: + dict val(fasta), pd.DataFrame(seq_counts), dict(nuc_probs),int(mu_length),int(std) + + output: + path("term_frags") + + script: + """ + mkdir fastqc_${sample_id}_logs + + """ +} + + + +/* Start the job: + * initialize variables + */ + +Channel + .fromFilePairs( params.reads, checkIfExists:true ) + .set { read_pairs_ch } + + +/* The "main" function: + * Use CLI arguments to fragment sequences and output text file with selected terminal fragments + */ + +workflow { + file_validation_ch=file_validation(params.fasta_file, params.counts_file, params.sep) + get_cut_number_ch = get_cut_number(seq_len, mean) + framentation_ch = fregmentation(fasta, seq_counts, nuc_probs, mu_length, std) } + + +/* Book keeping upon workflow completion */ +workflow.onComplete { + log.info (workflow.success ? "\nDone! Open the following report in your browser --> $params.outdir/multiqc/multiqc_report.html\n" : "Oops .. something went wrong") + ) +} +``` diff --git a/requirements.txt b/requirements.txt index 70de6ebc12f5dc4d32809ef34959efe77fbf47ea..8a9674c77d5ed060f30942792d2e3f14540bd1a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ argparse -biopython >= 1.78 +biopython numpy >= 1.23.3 pandas >= 1.4.4 \ No newline at end of file