Skip to content
Snippets Groups Projects
Commit c535a892 authored by BIOPZ-Gypas Foivos's avatar BIOPZ-Gypas Foivos
Browse files

- Delete unused files scripts/fg_extract_transcripts.py,...

- Delete unused files scripts/fg_extract_transcripts.py, scripts/heatmap_and_clustermap.py, scripts/perform_PCA.py
- Add rules (pca_kallisto, pca_salmon) that run zpca (https://github.com/zavolanlab/zpca) on genes and transcripts TPM tables from kallisto and salmon.
- The output is wired to multiqc_report but the plots are not visualized to multiqc. Update documentation.
- Update dag and rulegraph.
Fixes #140 #142
parent 2bdcf390
No related branches found
No related tags found
No related merge requests found
......@@ -80,7 +80,6 @@ rule finish:
"summary_kallisto",
"genes_tpm.tsv")
rule start:
'''
Get samples
......@@ -802,10 +801,14 @@ rule kallisto_merge_genes:
gtf = get_sample('gtf')
output:
gn_out = os.path.join(
gn_tpm = os.path.join(
config["output_dir"],
"summary_kallisto",
"genes_tpm.tsv")
"genes_tpm.tsv"),
gn_counts = os.path.join(
config["output_dir"],
"summary_kallisto",
"genes_counts.tsv")
params:
dir_out = os.path.join(
......@@ -868,10 +871,14 @@ rule kallisto_merge_transcripts:
for i in pd.unique(samples_table.index.values)]),
output:
tx_out = os.path.join(
tx_tpm = os.path.join(
config["output_dir"],
"summary_kallisto",
"transcripts_tpm.tsv")
"transcripts_tpm.tsv"),
tx_counts = os.path.join(
config["output_dir"],
"summary_kallisto",
"transcripts_counts.tsv")
params:
dir_out = os.path.join(
......@@ -911,6 +918,87 @@ rule kallisto_merge_transcripts:
1> {log.stdout} 2> {log.stderr}"
rule pca_salmon:
input:
tpm = os.path.join(
config["output_dir"],
"summary_salmon",
"quantmerge",
"{molecule}_tpm.tsv"),
params:
tpm_filter = "0",
tpm_pseudocount = "1"
output:
out = directory(os.path.join(
config["output_dir"],
"zpca",
"pca_salmon_{molecule}"))
log:
stderr = os.path.join(
config["log_dir"],
"pca_salmon_{molecule}.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"pca_salmon_{molecule}.stdout.log")
threads: 1
singularity:
"docker://zavolab/zpca:0.8"
shell:
"(zpca-tpm \
--tpm {input.tpm} \
--tpm-filter {params.tpm_filter} \
--tpm-pseudocount {params.tpm_pseudocount} \
--out {output.out} \
--verbose) \
1> {log.stdout} 2> {log.stderr}"
rule pca_kallisto:
input:
tpm = os.path.join(
config["output_dir"],
"summary_kallisto",
"{molecule}_tpm.tsv")
params:
tpm_filter = "0",
tpm_pseudocount = "1"
output:
out = directory(os.path.join(
config["output_dir"],
"zpca",
"pca_kallisto_{molecule}"))
log:
stderr = os.path.join(
config["log_dir"],
"pca_kallisto_{molecule}.stderr.log"),
stdout = os.path.join(
config["log_dir"],
"pca_kallisto_{molecule}.stdout.log")
threads: 1
singularity:
"docker://zavolab/zpca:0.8"
shell:
"(zpca-tpm \
--tpm {input.tpm} \
--tpm-filter {params.tpm_filter} \
--tpm-pseudocount {params.tpm_pseudocount} \
--out {output.out} \
--verbose) \
1> {log.stdout} 2> {log.stderr}"
rule star_rpm:
'''
Create stranded bedgraph coverage with STARs RPM normalisation
......@@ -1230,7 +1318,6 @@ rule alfa_qc:
-s {params.alfa_orientation}) &> {log}"
# cd {params.out_dir};
rule alfa_qc_all_samples:
'''
Run ALFA from stranded bedgraph files on all samples
......@@ -1397,6 +1484,19 @@ rule multiqc_report:
"ALFA",
"ALFA_plots_mqc.png"),
zpca_salmon = expand(os.path.join(
config["output_dir"],
"zpca",
"pca_salmon_{molecule}"),
molecule=["genes", "transcripts"]),
zpca_kallisto = expand(os.path.join(
config["output_dir"],
"zpca",
"pca_kallisto_{molecule}"),
molecule=["genes", "transcripts"]
),
multiqc_config = os.path.join(
config["output_dir"],
"multiqc_config.yaml")
......
This diff is collapsed.
This diff is collapsed.
......@@ -32,6 +32,8 @@ on installation and usage please see [here](README.md).
- [**salmon_quantmerge_transcripts**](#salmon_quantmerge_transcripts)
- [**kallisto_merge_genes**](#kallisto_merge_genes)
- [**kallisto_merge_transcripts**](#kallisto_merge_transcripts)
- [**pca_kallisto**](#pca_kallisto)
- [**pca_salmon**](#pca_salmon)
- [**generate_alfa_index**](#generate_alfa_index)
- [**alfa_qc**](#alfa_qc)
- [**alfa_qc_all_samples**](#alfa_qc_all_samples)
......@@ -400,7 +402,7 @@ Merge gene-level expression estimates for all samples with
- Gene TPM table (custom `.tsv`); used in
[**multiqc_report**](#multiqc_report)
- Gene read count table (custom `.tsv`); used in
[**multiqc_report**](#multiqc_report)
[**pca_salmon**](#pca_salmon) and [**pca_kallisto**](#pca_kallisto)
#### `salmon_quantmerge_transcripts`
......@@ -416,7 +418,7 @@ Merge transcript-level expression estimates for all samples with
- Transcript TPM table (custom `.tsv`); used in
[**multiqc_report**](#multiqc_report)
- Transcript read count table (custom `.tsv`); used in
[**multiqc_report**](#multiqc_report)
[**pca_salmon**](#pca_salmon) and [**pca_kallisto**](#pca_kallisto)
#### `kallisto_merge_genes`
......@@ -448,6 +450,28 @@ Merge transcript-level expression estimates for all samples with
- Transcript TPM table (custom `.tsv`)
- Transcript read count table (custom `.tsv`)
#### `pca_kallisto`
Run PCA analysis on kallisto genes and transcripts with [custom script][custom-script-zpca].
> Rule is run one time for transcript estimates and one time for genes estimates
- **Input**
- Transcript/Genes TPM table (custom `.tsv`)
- **Output**
- Directory with PCA plots, scree plot and top loading scores.
#### `pca_salmon`
Run PCA analysis on salmon genes and transcripts with [custom script][custom-script-zpca].
> Rule is run one time for transcript estimates and one time for genes estimates
- **Input**
- Transcript/Genes TPM table (custom `.tsv`)
- **Output**
- Directory with PCA plots, scree plot and top loading scores.
#### `generate_alfa_index`
Create index for [**ALFA**](#third-party-software-used).
......@@ -714,6 +738,7 @@ Generate pseudoalignments of reads to transcripts with
[custom-script-gtf-to-bed12]: <https://git.scicore.unibas.ch/zavolan_group/tools/gtf_transcript_type_to_bed12>
[custom-script-tin]: <https://git.scicore.unibas.ch/zavolan_group/tools/tin_score_calculation>
[custom-script-merge-kallisto]: <https://github.com/zavolanlab/merge_kallisto>
[custom-script-zpca]: <https://github.com/zavolanlab/zpca>
[docs-alfa]: <https://github.com/biocompibens/ALFA#manual>
[docs-bedgraphtobigwig]: <http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/>
[docs-bedtools]: <https://bedtools.readthedocs.io/en/latest/>
......
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# import needed (external) modules
# -----------------------------------------------------------------------------
try:
import HTSeq
except(Exception):
raise("[ERROR] HTSeq was not imported properly. Exiting.")
sys.exit(-1)
try:
from argparse import ArgumentParser, RawTextHelpFormatter, FileType
except(Exception):
raise("[ERROR] argparse was not imported properly. Exiting.")
sys.exit(-1)
try:
import sys
except(Exception):
raise("[ERROR] sys was not imported properly. Exiting.")
sys.exit(-1)
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# Main function
# -----------------------------------------------------------------------------
def main():
""" Main function """
__doc__ = "Filter gtf file based on the transcript biotype."
parser = ArgumentParser(description=__doc__, formatter_class=RawTextHelpFormatter)
parser.add_argument("--gtf",
dest = "gtf",
help = "Annotation file in GTF format with transcript support level information",
required = True,
metavar = "FILE")
parser.add_argument("--out",
dest = "out",
help = "GTF output file",
required = True,
metavar="FILE")
parser.add_argument("--transcript_biotype",
dest = "transcript_biotype",
help = "Transcript biotype or transcript type to include",
required = True,
default = "protein_coding,lincRNA")
parser.add_argument("-v",
"--verbose",
action = "store_true",
dest = "verbose",
default = False,
required = False,
help = "Verbose")
# _________________________________________________________________________
# -------------------------------------------------------------------------
# get the arguments
# -------------------------------------------------------------------------
try:
options = parser.parse_args()
except(Exception):
parser.print_help()
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
if options.verbose:
sys.stdout.write("Parsing gtf file and keep %s transcripts\n" % (str(options.transcript_biotype)))
list_of_transcript_types = str(options.transcript_biotype).split(",")
# parse gtf file
gtf = HTSeq.GFF_Reader(options.gtf)
# open output file
w = open(options.out, 'w')
for gtf_line in gtf:
if "transcript_biotype" in gtf_line.attr:
if gtf_line.attr['transcript_biotype'] in list_of_transcript_types:
w.write(gtf_line.get_gff_line())
if "transcript_type" in gtf_line.attr:
if gtf_line.attr['transcript_type'] in list_of_transcript_types:
w.write(gtf_line.get_gff_line())
w.close()
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# Call the Main function and catch Keyboard interrups
# -----------------------------------------------------------------------------
if __name__ == '__main__':
try:
main()
except KeyboardInterrupt:
sys.stderr.write("User interrupt!\n")
sys.exit(0)
#!/usr/bin/env python
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# import needed (external) modules
# -----------------------------------------------------------------------------
import sys
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from argparse import ArgumentParser, RawTextHelpFormatter
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# Main function
# -----------------------------------------------------------------------------
def main():
""" Main function """
__doc__ = "Create heatmap and clustermap based on a table (rows are genes/transcripts, columns are samples)."
parser = ArgumentParser(
description=__doc__,
formatter_class=RawTextHelpFormatter
)
parser.add_argument(
"--tpm",
dest="tpm",
help="TPM table (tsv)",
required=True,
metavar="FILE"
)
parser.add_argument(
"--out",
dest="out",
help="Output directory",
required=True,
metavar="FILE"
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
dest="verbose",
default=False,
required=False,
help="Verbose"
)
# _________________________________________________________________________
# -------------------------------------------------------------------------
# get the arguments
# -------------------------------------------------------------------------
try:
options = parser.parse_args()
except(Exception):
parser.print_help()
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
if options.verbose:
sys.stdout.write(f"Creating output directory: {options.out} {os.linesep}")
if not os.path.exists(options.out):
os.makedirs(options.out)
if options.verbose:
sys.stdout.write(f"Reading: {options.tpm} {os.linesep}")
df = pd.read_csv(options.tpm, header=0, sep="\t")
df.set_index(["Name"], inplace=True)
df = df + 1/32
df = np.log2(df)
if options.verbose:
sys.stdout.write(f"Generating heatmap {os.linesep}")
plt.rcParams["figure.figsize"] = (20,20)
g = sns.heatmap(df.corr(method="pearson"))
plt.savefig(os.path.join(options.out, "heatmap.pdf"))
if options.verbose:
sys.stdout.write(f"Generating clustermap {os.linesep}")
plt.rcParams["figure.figsize"] = (20,20)
g = sns.clustermap(df.corr(method="pearson"))
plt.savefig(os.path.join(options.out, "clustermap.pdf"))
if options.verbose:
sys.stdout.write(f"Done {os.linesep}")
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# Call the Main function and catch Keyboard interrups
# -----------------------------------------------------------------------------
if __name__ == '__main__':
try:
main()
except KeyboardInterrupt:
sys.stderr.write("User interrupt!" + os.linesep)
sys.exit(0)
#!/usr/bin/env python
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# import needed (external) modules
# -----------------------------------------------------------------------------
import sys
import os
import pandas as pd
import numpy as np
import random as rd
from sklearn.decomposition import PCA
from sklearn import preprocessing
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from argparse import ArgumentParser, RawTextHelpFormatter
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# Main function
# -----------------------------------------------------------------------------
def main():
""" Main function """
__doc__ = "Perform PCA based on a table (rows are genes/transcripts, columns are samples)."
parser = ArgumentParser(
description=__doc__,
formatter_class=RawTextHelpFormatter
)
parser.add_argument(
"--tpm",
dest="tpm",
help="TPM table (tsv)",
required=True,
metavar="FILE"
)
parser.add_argument(
"--out",
dest="out",
help="Output directory",
required=True,
metavar="FILE"
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
dest="verbose",
default=False,
required=False,
help="Verbose"
)
# _________________________________________________________________________
# -------------------------------------------------------------------------
# get the arguments
# -------------------------------------------------------------------------
try:
options = parser.parse_args()
except(Exception):
parser.print_help()
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
if options.verbose:
sys.stdout.write(f"Creating output directory: {options.out} {os.linesep}")
if not os.path.exists(options.out):
os.makedirs(options.out)
if options.verbose:
sys.stdout.write(f"Reading: {options.tpm} {os.linesep}")
df = pd.read_csv(options.tpm, header=0, sep="\t")
df.set_index(["Name"], inplace=True)
if options.verbose:
sys.stdout.write(f"Performing PCA {os.linesep}")
scaled_data = preprocessing.scale(df.T)
pca = PCA()
pca.fit(scaled_data)
pca_data = pca.transform(scaled_data)
per_var = np.round(pca.explained_variance_ratio_* 100, decimals=1)
labels = ['PC' + str(x) for x in range(1, len(per_var)+1)]
plt.figure()
plt.rcParams["figure.figsize"] = (20,20)
plt.bar(x=range(1,len(per_var)+1), height=per_var, tick_label=labels)
plt.ylabel('Percentage of Explained Variance')
plt.xlabel('Principal Component')
plt.title('Scree Plot')
plt.xticks(rotation='vertical')
plt.savefig(os.path.join(options.out, "scree_plot.pdf"))
plt.close()
pca_df = pd.DataFrame(pca_data, index=[*df], columns=labels)
if options.verbose:
sys.stdout.write(f"Generating plots in: {options.out} {os.linesep}")
# -------------------------------------------------------------------------
# PCA 1st and 2nd component
# -------------------------------------------------------------------------
plt.figure()
plt.rcParams["figure.figsize"] = (20,20)
plt.scatter(pca_df.PC1, pca_df.PC2)
plt.xlabel('PC1 - {0}%'.format(per_var[0]), fontsize=22)
plt.ylabel('PC2 - {0}%'.format(per_var[1]), fontsize=22)
plt.title('PCA components 1 & 2', fontsize=22)
for sample in pca_df.index:
plt.annotate(sample, (pca_df.PC1.loc[sample], pca_df.PC2.loc[sample]))
plt.savefig(os.path.join(options.out, "PCA_1_2.pdf"))
plt.close()
# -------------------------------------------------------------------------
# PCA 2nd and 3rd component
# -------------------------------------------------------------------------
plt.figure()
plt.rcParams["figure.figsize"] = (20,20)
plt.scatter(pca_df.PC2, pca_df.PC3)
plt.xlabel('PC2 - {0}%'.format(per_var[1]), fontsize=22)
plt.ylabel('PC3 - {0}%'.format(per_var[2]), fontsize=22)
plt.title('PCA components 2 & 3', fontsize=22)
for sample in pca_df.index:
plt.annotate(sample, (pca_df.PC2.loc[sample], pca_df.PC3.loc[sample]))
plt.savefig(os.path.join(options.out, "PCA_2_3.pdf"))
plt.close()
# -------------------------------------------------------------------------
# PCA 1st and 3rd component
# -------------------------------------------------------------------------
plt.figure()
plt.rcParams["figure.figsize"] = (20,20)
plt.scatter(pca_df.PC1, pca_df.PC3)
plt.xlabel('PC1 - {0}%'.format(per_var[0]), fontsize=22)
plt.ylabel('PC3 - {0}%'.format(per_var[2]), fontsize=22)
plt.title('PCA components 1 & 3', fontsize=22)
for sample in pca_df.index:
plt.annotate(sample, (pca_df.PC1.loc[sample], pca_df.PC3.loc[sample]))
plt.savefig(os.path.join(options.out, "PCA_1_3.pdf"))
plt.close()
# -------------------------------------------------------------------------
# PCA 3D
# -------------------------------------------------------------------------
labels = []
pc1 = []
pc2 = []
pc3 = []
for i, row in pca_df.iterrows():
labels.append(i)
pc1.append(row["PC1"])
pc2.append(row["PC2"])
pc3.append(row["PC3"])
fig = plt.figure(figsize=(20,20))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(pc1, pc2, pc3)
ax.set_xlabel('PC1 - {0}%'.format(per_var[0]))
ax.set_ylabel('PC2 - {0}%'.format(per_var[1]))
ax.set_zlabel('PC3 - {0}%'.format(per_var[2]))
for label, x, y, z in zip(labels, pc1, pc2, pc3):
ax.text(x, y, z, label)
plt.savefig(os.path.join(options.out, "PCA_3D.pdf"))
# -------------------------------------------------------------------------
# Write loading scores
# -------------------------------------------------------------------------
if options.verbose:
sys.stdout.write(f"Writing loading scores in: {options.out} {os.linesep}")
loading_scores = pd.Series(pca.components_[0], index=list(df.index))
sorted_loading_scores = loading_scores.abs().sort_values(ascending=False)
sorted_loading_scores.to_csv(os.path.join(options.out, "loading_scores.tsv"), header=False, sep="\t")
if options.verbose:
sys.stdout.write(f"Done {os.linesep}")
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# Call the Main function and catch Keyboard interrups
# -----------------------------------------------------------------------------
if __name__ == '__main__':
try:
main()
except KeyboardInterrupt:
sys.stderr.write("User interrupt!" + os.linesep)
sys.exit(0)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment