Commit e8d43d8c authored by Selim Bouaouina

updated mutant matrix file and run_GSEA.R

parent b7c3f562
@@ -15,7 +15,7 @@ kegg_to_t2g = function(data_set){
   GSEA_kegg_list <- list()
   for (org_id in kegg_orgids){
     tmp_subset <- data_set[[org_id]]
-    tmp_subset$label_joined <- paste(tmp_subset$pathway_id, tmp_subset$pathway_name, sep="_")
+    tmp_subset$label_joined <- paste(gsub("path:","",tmp_subset$pathway_id), gsub(" - Mycobacterium tuberculosis CDC1551| - Mycobacterium tuberculosis H37Rv","",tmp_subset$pathway_name), sep="_")
     GSEA_kegg_list[[org_id]] <- tmp_subset[,c("label_joined","uniprot_id")]
   }
   return(GSEA_kegg_list)
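For illustration, the effect of the two new gsub() calls on a KEGG entry of the format the patterns above imply (the specific values here are hypothetical):

```r
pathway_id   <- "path:mtu00010"
pathway_name <- "Glycolysis / Gluconeogenesis - Mycobacterium tuberculosis H37Rv"
paste(gsub("path:", "", pathway_id),
      gsub(" - Mycobacterium tuberculosis CDC1551| - Mycobacterium tuberculosis H37Rv", "",
           pathway_name),
      sep = "_")
#> [1] "mtu00010_Glycolysis / Gluconeogenesis"
```

The set labels lose the "path:" prefix and the organism suffix, which keeps the term names in the GSEA output short.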
@@ -44,31 +44,32 @@ kegg_to_t2g = function(data_set){
 run_GSEA = function(data_set, kingdom, pvalCOff,MAIN_PATH, kegg_list,mycobrowser_data){
+  set.seed(1234)
   if(!(kingdom %in% c("GO_CC","GO_BP","GO_MF", "KEGG", "MYCOBROWSER_FC", "MYCOBROWSER_FU", "MYCOBROWSER_PR"))){print("WARNING, choose kingdom you want to analyze. Options:\"GO_CC\",\"GO_BP\",\"GO_MF\", \"KEGG\", \"MYCOBROWSER_FC\", \"MYCOBROWSER_FU\", \"MYCOBROWSER_PR\".")}
   if(kingdom == "GO_CC"){
     print("You are running GSEA on GeneOntology category \"Cellular Compartment\".")
     load(paste(MAIN_PATH, "annotation_files", "cc_table_for_gmt.Rdata", sep="/")) # loads the data set: cc_table_for_gmt
-    return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=cc_table_for_gmt, by="fgsea")) #cutoff for adjusted pvalue
+    return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=cc_table_for_gmt, by="fgsea", seed=T)) #cutoff for adjusted pvalue
   }
   if(kingdom == "GO_BP"){
     print("You are running GSEA on GeneOntology category \"Biological Process\".")
     load(paste(MAIN_PATH, "annotation_files", "bp_table_for_gmt.Rdata", sep="/")) # loads the data set: bp_table_for_gmt
-    return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=bp_table_for_gmt, by="fgsea"))
+    return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=bp_table_for_gmt, by="fgsea", seed=T))
   }
   if(kingdom == "GO_MF"){
     print("You are running GSEA on GeneOntology category \"Molecular Function\".")
     load(paste(MAIN_PATH, "annotation_files", "mf_table_for_gmt.Rdata", sep="/")) # loads the data set: mf_table_for_gmt
-    return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=mf_table_for_gmt, by="fgsea"))
+    return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=mf_table_for_gmt, by="fgsea", seed=T))
   }
   if(kingdom == "KEGG"){
-    print("You are running GSEA on kegg pathways. The code runs for every Mtb organism ID on KEGG separately, so the output will be a list, not a table as when running the function for GO terms.")
+    print("You are running GSEA on KEGG pathways. The code runs for every Mtb organism ID on KEGG separately, so the output will be a list, not a table as when running the function for GO terms. Some KEGG organism IDs contain different protein identifiers that are not recognized by the GSEA function. This could be corrected, but because they are highly redundant it is not done here. The most important results come from organism IDs mtu and mtv, which are both Mtb H37Rv.")
     GSEA_kegg_list = kegg_to_t2g(kegg_list)
     GSEA_kegg_result_list <- list()
     kegg_orgids <- names(GSEA_kegg_list)
     for (org_id in kegg_orgids){
       message(paste("Running GSEA on comparison_",org_id," right now. Computing..."))
       runfun <- tryCatch(
-        res_GSEA <- GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=GSEA_kegg_list[[org_id]], by="fgsea"),
+        res_GSEA <- GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=GSEA_kegg_list[[org_id]], by="fgsea", seed=T),
         error = function(e) {
           message(paste("Running GSEA on the comparison: ",org_id," ended up in error...And Now for Something Completely Different."))
           NA
@@ -81,17 +82,17 @@ run_GSEA = function(data_set, kingdom, pvalCOff,MAIN_PATH, kegg_list,mycobrowser
   if(kingdom == "MYCOBROWSER_FC"){
     print("You are running GSEA on mycobrowser category \"Functional Category\".")
     mycobrowser_FunCat <- unique(mycobrowser_data[,c("Functional_Category","UniProt_AC")])
-    return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=mycobrowser_FunCat, by="fgsea"))
+    return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=mycobrowser_FunCat, by="fgsea", seed=T))
   }
   if(kingdom == "MYCOBROWSER_FU"){
     print("You are running GSEA on mycobrowser category \"Function\".")
     mycobrowser_Function <- unique(mycobrowser_data[,c("Function","UniProt_AC")])
-    return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=mycobrowser_Function, by="fgsea"))
+    return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=mycobrowser_Function, by="fgsea", seed=T))
   }
   if(kingdom == "MYCOBROWSER_PR"){
     print("You are running GSEA on mycobrowser category \"Product\".")
     mycobrowser_Product <- unique(mycobrowser_data[,c("Product","UniProt_AC")])
-    return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=mycobrowser_Product, by="fgsea"))
+    return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=mycobrowser_Product, by="fgsea", seed=T))
   }
 }
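The point of the added set.seed(1234) and seed=T arguments is to make the fgsea permutation step reproducible. A minimal check, as a sketch (assumes clusterProfiler is loaded; `ranked_proteins` is a named, sorted numeric vector as used later in the pipeline, and `term2gene` is a placeholder name for a two-column TERM2GENE data frame):

```r
library(clusterProfiler)

# Run the seeded GSEA twice and compare the result tables (sketch).
gsea_once <- function() {
  set.seed(1234)
  GSEA(rev(ranked_proteins), minGSSize = 1, maxGSSize = Inf, eps = 0,
       pvalueCutoff = 1, pAdjustMethod = "BH",
       TERM2GENE = term2gene, by = "fgsea", seed = TRUE)
}

res1 <- gsea_once()
res2 <- gsea_once()
identical(res1@result, res2@result)  # expected TRUE with set.seed() + seed = TRUE
```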
...
@@ -25,6 +25,7 @@ run_QC = function(input_data,qc_level){
   input_data$norm_quantity <- 2^(input_data$normalised_intensity_log2)
   # 4.3 CVs
   print(qc_cvs(data = input_data, grouping = pep_grouping_key, condition = r_condition, intensity = pep_quantity, plot = TRUE, plot_style = "violin"))
+  print("The plot below shows CVs after normalisation of 'pep_quantity'.")
   print(qc_cvs(data = input_data, grouping = pep_grouping_key, condition = r_condition, intensity = norm_quantity, plot = TRUE, plot_style = "violin"))
   # 4.4 PCA qc_pca( data = input_data, sample = r_file_name, grouping = pep_grouping_key, intensity = normalised_intensity_log2, condition = r_condition, digestion = NULL, plot_style = "scree" )
   print(qc_pca(data = input_data, sample = r_file_name, grouping = pep_grouping_key, intensity = normalised_intensity_log2, condition = r_condition, components= c("PC1","PC2"), plot_style = "pca" ) )
@@ -35,6 +36,7 @@ run_QC = function(input_data,qc_level){
   print(qc_ids(data = input_data, sample = r_file_name, grouping = pep_grouping_key, intensity = norm_quantity, condition = r_condition, title = "Peptide identifications per sample", plot = TRUE ) )
   # 4.7 intensity distribution
   print(qc_intensity_distribution(data = input_data, sample = r_file_name, grouping = pep_grouping_key, intensity_log2 = transformed_quantity_log2 , plot_style = "boxplot") )
+  print("The plot below shows the intensity distribution after normalisation of 'pep_quantity'.")
   print(qc_intensity_distribution(data = input_data, sample = r_file_name, grouping = pep_grouping_key, intensity_log2 = normalised_intensity_log2, plot_style = "boxplot"))
   # 4.8 data completeness
   print(qc_data_completeness(data = input_data, sample = r_file_name, grouping = pep_grouping_key, intensity = normalised_intensity_log2, plot = TRUE) )
@@ -48,6 +50,7 @@ run_QC = function(input_data,qc_level){
   input_data$norm_quantity <- 2^(input_data$normalised_intensity_log2)
   # 4.3 CVs
   print(qc_cvs(data = input_data, grouping = pg_protein_groups, condition = r_condition, intensity = pg_quantity, plot = TRUE, plot_style = "violin"))
+  print("The plot below shows CVs after normalisation of 'pg_quantity'.")
   print(qc_cvs(data = input_data, grouping = pg_protein_groups, condition = r_condition, intensity = norm_quantity, plot = TRUE, plot_style = "violin"))
   # 4.4 PCA qc_pca( data = input_data, sample = r_file_name, grouping = pg_protein_groups, intensity = normalised_intensity_log2, condition = r_condition, digestion = NULL, plot_style = "scree" )
   print(qc_pca(data = input_data, sample = r_file_name, grouping = pg_protein_groups, intensity = normalised_intensity_log2, condition = r_condition, components= c("PC1","PC2"), plot_style = "pca" ) )
@@ -58,6 +61,7 @@ run_QC = function(input_data,qc_level){
   print(qc_ids(data = input_data, sample = r_file_name, grouping = pg_protein_groups, intensity = norm_quantity, condition = r_condition, title = "Peptide identifications per sample", plot = TRUE ) )
   # 4.7 intensity distribution
   print(qc_intensity_distribution(data = input_data, sample = r_file_name, grouping = pg_protein_groups, intensity_log2 = transformed_quantity_log2 , plot_style = "boxplot") )
+  print("The plot below shows the intensity distribution after normalisation of 'pg_quantity'.")
   print(qc_intensity_distribution(data = input_data, sample = r_file_name, grouping = pg_protein_groups, intensity_log2 = normalised_intensity_log2, plot_style = "boxplot") )
   # 4.8 data completeness
   print(qc_data_completeness(data = input_data, sample = r_file_name, grouping = pg_protein_groups, intensity = normalised_intensity_log2, plot = TRUE) )
...
@@ -9,8 +9,8 @@ N2030,N0157,L1,HF,L1_HF,rpoB,S450L,na,1
 N2031,N0157,L1,LF,L1_LF,rpoB,H445R,na,1
 N1888,N0145,L2,HF,L2_HF,rpoB,S450L,na,1
 N1890,N0145,L2,LF,L2_LF,rpoB,H445R,na,1
-N2502,N1283,L4,HF,L4_HF,rpoB,S450L,na,1
-N2850,N1283,L4,LF,L4_LF,rpoB,H445R,na,1
+N2502,N1283,L4,LF,L4_LF,rpoB,H445R,na,1
+N2850,N1283,L4,HF,L4_HF,rpoB,S450L,na,1
 N0157,N0157,L1,WT,L1_WT,WT,WT,1,0
 N0145,N0145,L2,WT,L2_WT,WT,WT,1,0
 N1283,N1283,L4,WT,L4_WT,WT,WT,1,0
\ No newline at end of file
@@ -12,9 +12,7 @@ knitr::opts_chunk$set(echo = TRUE,
                       message = TRUE)
 ```
-# Proteome analysis pipeline
-### Main goal
+### How to run the pipeline
 Run the pipeline with the following code. Example for comparison of WT to S450L.
 ```{bash main_goal, eval=FALSE}
 sbatch --job-name JOB_NAME SLURM_SCRIPT.sh /sciCore/MAIN/PATH OUTPUTFILE_NAME EXPERIMENT_NAME ANNOTATION_FILE.csv
@@ -22,77 +20,59 @@ e.g.:
 sbatch --job-name tbeeprotpip_WT_vs_S450L tbeeprotpip_slurm_script.sh /scicore/home/gagneux/bousel00/tb/TbX015 WT_vs_S450L_output TbX015 annotation_file_wt_vs_s450l.csv
 ```
-### Split the work in four parts
+### Analysis pipeline
 * Load, filter and merge data
 * QC peptide level
 * QC protein level
 * Differential expression analysis
-  - Comparison level(s), defined by the composition of the annotation file
+  - Comparison level(s) are defined by the composition of the annotation file
 * Enrichment Analysis
   - Annotations
     - Gene Ontology
-      - MF
-      - CC
-      - BP
+      - Molecular Function
+      - Cellular Compartment
+      - Biological Process
     - KEGG Pathways
     - Mycobrowser
-      - FC
+      - Functional Category
       - Function
       - Product
-### Thoughts and Comments
-From <https://raps-with-r.dev/project_rewrite.html> I learn that I can shorten my script a lot and make it more easy readable and executable.
-There the author also suggests to work with the {target} R package, which is also introduced here <https://books.ropensci.org/targets/walkthrough.html>.
-More concrete:
-* work with git, to learn it, and because it will be safer in case of messing up [check]
-* make separate R-scripts: [check]
-  - where you define all functions. source this script in the code [check]
-  - where you load, filter and merge the data [check]
-  - QC peptide-level (can maybe write a function for QC and then decide the level in function-input) [check]
-  - QC protein-level (can maybe write a function for QC and then decide the level in function-input) [check]
-  - Enrichment Analysis [check]
-* modify for loops into apply functions, to make the script more robust (examples on <https://raps-with-r.dev/project_rewrite.html>) [no]
-* for the script to run faster and to use less memory, make sure you unload all data-sets as soon as they are not needed in the work environment anymore (can check all loaded objects with {ls()}) [to_do]
-* make the script write it's own report with comments, plots and tables. [check]
-* try to make it work with target, so independent jobs can run in parallel and you can run the script on multiple cores in parallel (e.g. the comparison-levels) [to look into]
-* provide a folder in the git-folder, where you have all annotation files [check]
-* providing the extraction-OD data, I can also further test for correlation between CV and variability between extraction-ODs of the strain, and add this to the QC [to_do]
-* write in report what package versions were used and what annotation data files (as they contain the date) [to_do]
-* maybe it helps when reading in the mutant matrix, to make a list of it, for easier access of elements? [no]
-# load all packages
-```{r loading_the_packages}
-library("protti")
-library("magrittr")
-library("dplyr") # from tidyverse
-library("tidyr") # from tidyverse
-library("ggplot2") # from tidyverse
-library("purrr") # from tidyverse
-library("gplots")
-library("ggpubr")
-library("VennDiagram")
-library("ggvenn")
-library("KEGGREST")
-library("Peptides")
-library("stringr")
-library("fgsea")
-library("ggnewscale")
-library("clusterProfiler")
-library("pathview")
-library("enrichplot")
-library("biomaRt")
-library("ggrepel")
-library("msigdbr")
-library("ggridges")
-library("lattice") #to print pdfs from loops
+TODO: providing the extraction-OD data, I can also further test for correlation between CV and variability between extraction-ODs of the strain, and add this to the QC [to_do]
+# Load all packages
+```{r loading_the_packages, message=FALSE, warning=FALSE}
+library(protti)
+library(magrittr)
+library(dplyr) # from tidyverse
+library(tidyr) # from tidyverse
+library(ggplot2) # from tidyverse
+library(purrr) # from tidyverse
+library(gplots)
+library(ggpubr)
+library(VennDiagram)
+library(ggvenn)
+library(KEGGREST)
+library(Peptides)
+library(stringr)
+library(fgsea)
+library(ggnewscale)
+library(clusterProfiler)
+library(pathview)
+library(enrichplot)
+library(biomaRt)
+library(ggrepel)
+library(msigdbr)
+library(ggridges)
+library(lattice) #to print pdfs from loops
+#library(Cairo)
+#library(png)
 ```
 ## Load file paths
 The file paths are loaded from the command line input.
 The variables are given on the command line; the params argument of the rmarkdown::render() function passes the values into the R script, where they can be accessed via params$variableINeed.
-```{r loading_the_paths}
+```{r loading_the_paths, echo = FALSE}
 #MAIN_PATH = "/scicore/home/gagneux/bousel00/tb/TbX015"
 MAIN_PATH = params$MAIN_PATH
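The chunk above reads params$MAIN_PATH, so the SLURM script presumably calls rmarkdown::render() with a params list built from the command-line arguments. A hypothetical sketch of such a call (the Rmd file name and the parameter names other than MAIN_PATH are assumptions):

```r
# Hypothetical render call issued by the SLURM wrapper script.
rmarkdown::render(
  input       = "proteome_pipeline.Rmd",     # assumed file name
  output_file = "WT_vs_S450L_output.html",   # OUTPUTFILE_NAME
  params      = list(
    MAIN_PATH          = "/scicore/home/gagneux/bousel00/tb/TbX015",
    EXPERIMENT         = "TbX015",                            # assumed parameter name
    ms_annotation_file = "annotation_file_wt_vs_s450l.csv"    # assumed parameter name
  )
)
# Inside the Rmd these values are then available as params$MAIN_PATH, etc.
```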
@@ -110,20 +90,28 @@ mutant_matrix = system.file("annotation_files", "PROTEOME_PIPELINE_MUTANTMATRIX.
 if (mutant_matrix == "") {
   stop(paste("Could not find `PROTEOME_PIPELINE_MUTANTMATRIX.txt`. Try re-installing `tbeeprotpip`."), call. = FALSE)
 }
+print(paste("MAIN_PATH:", MAIN_PATH))
+print(paste("EXPERIMENT:", EXPERIMENT))
+print(paste("ms_annotation_file:", ms_annotation_file))
+print(paste("mutant_matrix:", mutant_matrix))
 ```
 ## Load the data
 Here you load the Spectronaut output and the annotation data.
-```{r load_data_spec.R}
+```{r load_data_spec.R, include=FALSE}
 base_spectronaut_data <- read_protti(filename = spectronaut_xls_file, sep="\t")
 ```
-```{r load_data_annot.R}
+```{r load_data_annot.R, echo=FALSE}
 #The annotation data contains only information about the samples and conditions of interest.
 annot <- read.csv(ms_annotation_file, header = TRUE, sep=";")
+# Display the data frame using kable
+knitr::kable(annot)
 ```
-```{r load_data_mutantmat.R}
+```{r load_data_mutantmat.R, echo=FALSE}
 # load the extraction-OD data
@@ -131,20 +119,22 @@ annot <- read.csv(ms_annotation_file, header = TRUE, sep=";")
 # load the mutant matrix information
 mutantmatrix_data <- read.csv(mutant_matrix, sep=",", header=T)
+# Display the data frame using kable
+knitr::kable(mutantmatrix_data)
 ```
 The function 'latest_file' will load the latest version of a file with a defined name and the date it was generated.
-```{r load_data_uniprpt.R}
+```{r load_data_uniprpt.R, include=FALSE}
 # load Uniprot annotation (excluding GO terms) | see script fetch_uniprot_separate_script.r
 load(latest_file(MAIN_PATH,"uniprot"))
 ```
-```{r load_data_mycobrow.R}
+```{r load_data_mycobrow.R, include=FALSE}
 # load Mycobrowser annotation | downloaded from mycobrowser website on 24. May 2022
 mycobrowser_data <- read.csv("/scicore/home/gagneux/bousel00/tb/TbX015/annotation_files/mycobrowser_Mycobacterium_tuberculosis_H37Rv_txt_v4.txt", header=T, sep="\t")
 ```
-```{r load_data_kegg.R}
+```{r load_data_kegg.R, include=FALSE}
 # load KEGG annotation
 load(latest_file(MAIN_PATH,"kegg")) # loads the data set: kegg_list
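The chunks above call a project helper, latest_file(), whose definition is not part of this commit. A hypothetical sketch of what such a helper could look like, assuming the versioned annotation files live under annotation_files/ and that picking the most recently modified match is acceptable (the actual helper may instead parse the date embedded in the file name):

```r
# Hypothetical sketch -- not the project's actual implementation.
latest_file <- function(main_path, keyword) {
  files <- list.files(file.path(main_path, "annotation_files"),
                      pattern = keyword, full.names = TRUE)
  if (length(files) == 0) stop("No file matching '", keyword, "' found.")
  files[which.max(file.mtime(files))]  # newest matching file
}

# load(latest_file(MAIN_PATH, "uniprot"))   # as used in the chunk above
```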
@@ -155,8 +145,8 @@ load(latest_file(MAIN_PATH,"kegg")) # loads the data set: kegg_list
 #load(paste(MAIN_PATH, "annotation_files", "GSEA_kegg_list.Rdata", sep="/")) # loads the data set: GSEA_kegg_list
 ```
-TO DO: think about what to do with the QuickGO data...and where the uniprot GO data stems from.
-```{r load_data_qg.R}
+TODO: check again if the QuickGO data is preferable to the uniprot GO (and how it is downloaded)
+```{r load_data_qg.R, eval=FALSE}
 # load QuickGO annotation (all three domains)
 quick_go_data <- read.csv("/scicore/home/gagneux/bousel00/tb/TbX015/annotation_files/QuickGO_annotations_MTBC_1773_ReviewedAndUnreviewed_20220722.tsv", header=T, sep="\t")
 ```
@@ -171,18 +161,22 @@ go_mf_gmt = make_gmt(uniprot, "go_molecular_function") # uses my defined functio
 ```
 ## Load data for GSEA
-TO DO: These data sets remain to be generated from the gmt file...WHY DID I WRITE THIS DOWN? CHECK!
-```{r load_data_for_GSEA}
+TODO: These data sets remain to be generated from the gmt file...WHY DID I WRITE THIS DOWN? CHECK!
+```{r load_data_for_GSEA, echo=FALSE}
 load(paste(MAIN_PATH, "annotation_files", "cc_table_for_gmt.Rdata", sep="/")) # loads the data set: cc_table_for_gmt
 load(paste(MAIN_PATH, "annotation_files", "bp_table_for_gmt.Rdata", sep="/")) # loads the data set: bp_table_for_gmt
 load(paste(MAIN_PATH, "annotation_files", "mf_table_for_gmt.Rdata", sep="/")) # loads the data set: mf_table_for_gmt
+print(paste(MAIN_PATH, "annotation_files", "cc_table_for_gmt.Rdata", sep="/"))
+print(paste(MAIN_PATH, "annotation_files", "bp_table_for_gmt.Rdata", sep="/"))
+print(paste(MAIN_PATH, "annotation_files", "mf_table_for_gmt.Rdata", sep="/"))
 ```
 ## Filtering the data for Quality Control (QC)
 Here I remove the "r_replicate" column, because it will be added later on with the proper replicate numbers.
 Then I remove the base_spectronaut_data from the environment to save RAM.
-Then I remove all decoy peptides and reshuffle the annot file.
-```{r filtering_for_QC.R}
+Then I remove all decoy peptides and reorder the columns of the annot file so it is easier to work with.
+```{r filtering_for_QC.R, include=FALSE}
 base_spectronaut_data_reduced <- subset(base_spectronaut_data, select=-c(r_replicate))
 rm(base_spectronaut_data)
@@ -201,8 +195,8 @@ annot$r_replicate <- c(1:dim(annot)[1])
 ```
 ## Annotate Spectronaut-Data
-Here I annotate the data. As each line in "base_spectronaut_data_reduced" matches at most one line in y, choose relationship = "many-to-one". Then filter the data, so only the rows of interest are kept, menaing only the ones which got an entry from the merging with the annotation data. Then remove the two old data sets, to not overload the environment memory
-```{r annotating_with_MS_annotation.R}
+Here I annotate the data. As each line in "base_spectronaut_data_reduced" matches at most one line in y, choose relationship = "many-to-one". Then filter the data, so only the rows of interest are kept, meaning only the ones which got an entry from the merging with the annotation data. Then remove the two old data sets to save RAM.
+```{r annotating_with_MS_annotation.R, include=FALSE}
 spectronaut_data <- base_spectronaut_data_reduced %>%
   full_join(annot, by="r_file_name", relationship = "many-to-one") %>%
   # filter rows where "Condition" is empty
@@ -217,28 +211,28 @@ rm(base_spectronaut_data_reduced, annot)
 ## Quality Control- Peptide Level
 Here I conduct quality control of the data. First on peptide level. Some plots are shown twice, because they first show the un-normalized data, then the normalized.
-```{r pepQC.R}
+```{r pepQC.R, echo=FALSE}
 strwrap("First QualityControl is run on peptide level.",prefix = "\n", initial = "")
 run_QC(spectronaut_data, "peptide")
 ```
 ## Quality Control- Protein Level
 Here I conduct quality control of the data. Now on protein level. Some plots are shown twice, because they first show the un-normalized data, then the normalized.
-```{r protQC.R}
+```{r protQC.R, echo=FALSE}
 strwrap("Second QualityControl is run on protein level.",prefix = "\n", initial = "")
 run_QC(spectronaut_data, "protein")
 ```
 ## Filter the data for Expression Analysis
 Remove the non-proteotypic peptides.
-```{r filtering_for_EA.R}
+```{r filtering_for_EA.R, include=FALSE}
 data_filtered_proteotypic <- spectronaut_data %>%
   filter(pep_is_proteotypic == TRUE)
 ```
 ## Calculate the sequence coverage
 For this, first annotate the data with the mutant matrix, which contains information needed for peptide type and sequence coverage.
-```{r calculate_sequence_coverage.R}
+```{r calculate_sequence_coverage.R, echo=FALSE}
 data_filtered_uniprot <- data_filtered_proteotypic %>%
   left_join(y = uniprot,
             by = "pg_protein_accessions") %>%
@@ -263,7 +257,7 @@ qc_sequence_coverage(
 ## Calculate missingness and differential abundance of proteins
 These functions from protti calculate the missingness based on adjustable thresholds and calculate the differential abundance of proteins. Furthermore, a new column of logicals is added indicating whether a protein is significantly DE or not.
 To do so, the protein intensities need to be normalized first. This is already done in the QC, but not transferred from there to the global environment.
-```{r calculate_missingness_and_DE.R}
+```{r calculate_missingness_and_DE.R, echo=FALSE}
 data_filtered_uniprot$transformed_quantity_log2 = log2(data_filtered_uniprot$pg_quantity)
 data_filtered_uniprot <- normalise(data_filtered_uniprot, sample = r_file_name, intensity_log2 = transformed_quantity_log2, method = "median") # this adds the column "normalised_intensity_log2"
 data_filtered_uniprot$norm_quantity <- 2^(data_filtered_uniprot$normalised_intensity_log2)
@@ -272,19 +266,49 @@ diff_abundance_data = calculate_missingness_and_DE(data_filtered_uniprot, 0.7, 0
 diff_abundance_significant <- diff_abundance_data %>%
   mutate(is_significant = ifelse((adj_pval < 0.01), TRUE, FALSE))
+diff_abundance_significant_file_name = rev(unlist(strsplit(ms_annotation_file, "/")))[1] %>%
+  gsub("annotation_file_","",.) %>%
+  gsub(".csv","",.) %>%
+  paste(.,"_GSEA_diff_abundance_significant.tsv", sep="")
+diff_abundance_significant_export_subset_file_name = rev(unlist(strsplit(ms_annotation_file, "/")))[1] %>%
+  gsub("annotation_file_","",.) %>%
+  gsub(".csv","",.) %>%
+  paste(.,"_GSEA_diff_abundance_subset_significant.tsv", sep="")
+# diff_abundance_significant_export_subset = unique(diff_abundance_significant[diff_abundance_significant$is_significant==TRUE,c("r_file_name","r_condition","r_replicate","pg_protein_groups","pg_protein_accessions","pg_quantity","SampleDescription","COI","n_number","length","mass","go_biological_process","go_cellular_component","go_molecular_function","go_id","cognate_wt","lin_class","mut_gene","aa_substitution","transformed_quantity_log2","norm_quantity","comparison","missingness","pval","adj_pval","diff","t_statistic","std_error","df","avg_abundance","n_obs","is_significant")])
+# take also the non-significant ones, so the plot can be drawn
+diff_abundance_significant_export_subset = unique(diff_abundance_significant[,c("r_file_name","r_condition","r_replicate","pg_protein_groups","pg_protein_accessions","pg_quantity","SampleDescription","COI","n_number","length","mass","go_biological_process","go_cellular_component","go_molecular_function","go_id","cognate_wt","lin_class","mut_gene","aa_substitution","transformed_quantity_log2","norm_quantity","comparison","missingness","pval","adj_pval","diff","t_statistic","std_error","df","avg_abundance","n_obs","is_significant")])
+diff_abundance_significant_print_data = unique(diff_abundance_significant[diff_abundance_significant$is_significant==TRUE,c("n_number","pg_protein_groups","COI","aa_substitution","norm_quantity","comparison","adj_pval")])
+diff_abundance_significant_print_data$tmp_data = paste(diff_abundance_significant_print_data$COI, diff_abundance_significant_print_data$pg_protein_groups, sep="_")
+means_table = aggregate(diff_abundance_significant_print_data$norm_quantity, list(diff_abundance_significant_print_data$tmp_data), FUN=mean)
+names(means_table) = c("tmp_data", "mean_norm_quantity")
+diff_abundance_significant_print_data = diff_abundance_significant_print_data[,-c(1,4,5)] %>%
+  right_join(means_table, by="tmp_data")
+# Display the data frame using kable
+knitr::kable(unique(diff_abundance_significant_print_data[,-5]))
+#write.table(data.frame(diff_abundance_significant), paste(MAIN_PATH, "output_files",diff_abundance_significant_file_name, sep="/"), quote = F, col.names = T, row.names = F, sep = "|")
+write.table(data.frame(diff_abundance_significant_export_subset), paste(MAIN_PATH, "output_files",diff_abundance_significant_export_subset_file_name, sep="/"), quote = F, col.names = T, row.names = F, sep = "|")
 ```
 ## Subset the data for GSEA
 Here the diff_abundance_data is subset to only the columns necessary for GSEA.
-```{r subset_for_GSEA.R}
+```{r subset_for_GSEA.R, include=FALSE}
 diff_abundance_data_DEsubset <- unique(diff_abundance_data[c("pg_protein_groups","comparison","diff","t_statistic","adj_pval")])
 ```
 ## Calculate and plot the p-value distribution
 This can already tell something about the data.
-The non-adjusted p-values are plotted, to look for any tendencies my data might has, e.g. left peak shows that often my nullhypothesis can be rejected. The tail on the right shows level of FDR to set for minimizing false positives. The function automatically creates a plot.
-```{r pval_distribution.R}
+The non-adjusted p-values are plotted to look for any tendencies my data might have, e.g. a left peak shows that the null hypothesis can often be rejected. The tail on the right shows the level of FDR to set for minimizing false positives. The function automatically creates a plot.
+```{r pval_distribution.R, echo=FALSE}
 pval_distribution_plot(data = diff_abundance_data,
                        grouping = pg_protein_accessions, # correct?
                        pval = pval
@@ -292,15 +316,15 @@ pval_distribution_plot(data = diff_abundance_data,
 ```
 ## Volcano Plot
-Here a volcano plot of the comparison is drawn. The function automatically creates a plot.
-```{r volcanoplot.R}
-volcanoPlot_file_name = rev(unlist(strsplit(ms_annotation_file, "/")))[1] %>%
+Here a volcano plot of the comparison is drawn. The function automatically creates a plot. INFO: Because the plot contains so many data points, it is not shown here; it is only exported as a PDF (which is not easy to open either, so convert it to a PNG somehow).
+```{r volcanoplot.R, echo=FALSE}
+volcanoPlot_file_name_pdf = rev(unlist(strsplit(ms_annotation_file, "/")))[1] %>%
   gsub("annotation_file_","",.) %>%
   gsub(".csv","",.) %>%
   paste(.,"_volcanoPlot.pdf", sep="")
 graphics.off()
-pdf(paste(MAIN_PATH, "output_files", volcanoPlot_file_name, sep="/"))
+pdf(paste(MAIN_PATH, "output_files", volcanoPlot_file_name_pdf, sep="/"))
 print(
   volcano_plot(
     data = diff_abundance_data,
@@ -317,20 +341,26 @@ volcano_plot(
   )
 )
 dev.off()
+print(paste("PDF is exported to: ",paste(MAIN_PATH, "output_files", volcanoPlot_file_name_pdf, sep="/")))
 ```
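The note above suggests converting the exported volcano-plot PDF to a PNG. One way to do that from R, as a sketch (assumes the pdftools package, which is not among the packages loaded by this pipeline; the output PNG name is auto-generated next to the PDF):

```r
# Convert the exported volcano-plot PDF to a PNG (assumes pdftools is installed).
library(pdftools)
pdf_path <- file.path(MAIN_PATH, "output_files", volcanoPlot_file_name_pdf)
pdf_convert(pdf_path, format = "png", dpi = 300)
```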
 ## protti-based GO analysis
 I have made this a function, where you can choose whether it should calculate CC, BP or MF, and whether a table or a plot is returned.
-```{r protti_GO_analysis.R}
-go_cc_protti_analysis = protti_GO_analysis(diff_abundance_significant, "CC", T)
-go_cc_protti_analysis = protti_GO_analysis(diff_abundance_significant, "BP", T)
-go_cc_protti_analysis = protti_GO_analysis(diff_abundance_significant, "MF", T)
+```{r protti_GO_analysis.R, echo=FALSE}
+go_cc_protti_analysis = protti_GO_analysis(diff_abundance_significant, "CC", F)
+print("protti - Cellular Compartment")
+knitr::kable(go_cc_protti_analysis)
+go_bp_protti_analysis = protti_GO_analysis(diff_abundance_significant, "BP", F)
+print("protti - Biological Process")
+knitr::kable(go_bp_protti_analysis)
+go_mf_protti_analysis = protti_GO_analysis(diff_abundance_significant, "MF", F)
+print("protti - Molecular Function")
+knitr::kable(go_mf_protti_analysis)
 ```
 ## Rank the proteins for GSEA
-PREREQUISITES: rank proteins per condition based on their t-statistics.
-```{r rank_proteins.R}
+For Gene Set Enrichment Analysis (GSEA), rank proteins per condition based on their t-statistics.
+```{r rank_proteins.R, include = FALSE}
 ranked_proteins = as.numeric(diff_abundance_data_DEsubset$t_statistic)
 names(ranked_proteins) = diff_abundance_data_DEsubset$pg_protein_groups
 ranked_proteins = sort(ranked_proteins)
@@ -341,22 +371,69 @@ One can choose to run GSEA on GO terms, KEGG pathways or Mycobrowser categories.
 TO DO: check that none of these commands are dependent on the result of the protti_GO_analysis.R script.
 There is also a protti function for the same task; however, this would need more data transformation and reshuffling (I did it, but it is not very straightforward and therefore error-prone).
 Additionally, I want all GSEA to be calculated the same way.
+First transform some data so GSEA of KEGG and Mycobrowser work.
+```{r GSEA.R}
+GO_CC_GSEA = run_GSEA(ranked_proteins, "GO_CC", 1, MAIN_PATH, kegg_list, mycobrowser_data)
+GO_BP_GSEA = run_GSEA(ranked_proteins, "GO_BP", 1, MAIN_PATH, kegg_list, mycobrowser_data)
+GO_MF_GSEA = run_GSEA(ranked_proteins, "GO_MF", 1, MAIN_PATH, kegg_list, mycobrowser_data)
+KEGG_GSEA = run_GSEA(ranked_proteins, "KEGG", 1, MAIN_PATH, kegg_list, mycobrowser_data)
+MYB_FC = run_GSEA(ranked_proteins, "MYCOBROWSER_FC", 1, MAIN_PATH, kegg_list, mycobrowser_data)
+MYB_FU = run_GSEA(ranked_proteins, "MYCOBROWSER_FU", 1, MAIN_PATH, kegg_list, mycobrowser_data)
+MYB_PR = run_GSEA(ranked_proteins, "MYCOBROWSER_PR", 1, MAIN_PATH, kegg_list, mycobrowser_data)
+```
+## Plot GSEA results
 First transform some data so GSEA of KEGG and Mycobrowser work
-```{r GSEA.R}
-GO_CC_GSEA = run_GSEA(ranked_proteins, "GO_CC", 0.01, MAIN_PATH, kegg_list, mycobrowser_data)
-GO_BP_GSEA = run_GSEA(ranked_proteins, "GO_BP", 0.01, MAIN_PATH, kegg_list, mycobrowser_data)
-GO_MF_GSEA = run_GSEA(ranked_proteins, "GO_MF", 0.01, MAIN_PATH, kegg_list, mycobrowser_data)
-KEGG_GSEA = run_GSEA(ranked_proteins, "KEGG", 0.01, MAIN_PATH, kegg_list, mycobrowser_data)
-MYB_FC = run_GSEA(ranked_proteins, "MYCOBROWSER_FC", 0.01, MAIN_PATH, kegg_list, mycobrowser_data)
-MYB_FU = run_GSEA(ranked_proteins, "MYCOBROWSER_FU", 0.01, MAIN_PATH, kegg_list, mycobrowser_data)
-MYB_PR = run_GSEA(ranked_proteins, "MYCOBROWSER_PR", 0.01, MAIN_PATH, kegg_list, mycobrowser_data)
+```{r plot_GSEA.R, echo=FALSE}
+enrichplot::dotplot(GO_CC_GSEA, x="NES", orderBy="p.adjust", title="GO - Cellular Compartment")
+# Display the data frame using kable
+knitr::kable(GO_CC_GSEA@result[GO_CC_GSEA@result$p.adjust<=0.05,c("setSize","enrichmentScore","NES","p.adjust")])
+enrichplot::dotplot(GO_BP_GSEA, x="NES", orderBy="p.adjust", title="GO - Biological Process")
+# Display the data frame using kable
+knitr::kable(GO_BP_GSEA@result[GO_BP_GSEA@result$p.adjust<=0.05,c("setSize","enrichmentScore","NES","p.adjust")])
+enrichplot::dotplot(GO_MF_GSEA, x="NES", orderBy="p.adjust", title="GO - Molecular Function")
+# Display the data frame using kable
+knitr::kable(GO_MF_GSEA@result[GO_MF_GSEA@result$p.adjust<=0.05,c("setSize","enrichmentScore","NES","p.adjust")])
+enrichplot::dotplot(MYB_FC, x="NES", orderBy="p.adjust", title="Mycobrowser - Functional Category")
+# Display the data frame using kable
+knitr::kable(MYB_FC@result[MYB_FC@result$p.adjust<=0.05,c("setSize","enrichmentScore","NES","p.adjust")])
+enrichplot::dotplot(MYB_FU, x="NES", orderBy="p.adjust", title="Mycobrowser - Function")
+# Display the data frame using kable
+knitr::kable(MYB_FU@result[MYB_FU@result$p.adjust<=0.05,c("setSize","enrichmentScore","NES","p.adjust")])
+enrichplot::dotplot(MYB_PR, x="NES", orderBy="p.adjust", title="Mycobrowser - Product")
+# Display the data frame using kable
+knitr::kable(MYB_PR@result[MYB_PR@result$p.adjust<=0.05,c("setSize","enrichmentScore","NES","p.adjust")])
+enrichplot::dotplot(KEGG_GSEA[["mtu"]], x="NES", orderBy="p.adjust", title = "KEGG - mtu")
+# Display the data frame using kable
+knitr::kable(KEGG_GSEA[["mtu"]]@result[KEGG_GSEA[["mtu"]]@result$p.adjust<=0.05,c("setSize","enrichmentScore","NES","p.adjust")])
+enrichplot::dotplot(KEGG_GSEA[["mtv"]], x="NES", orderBy="p.adjust", title = "KEGG - mtv")
+# Display the data frame using kable
+knitr::kable(KEGG_GSEA[["mtv"]]@result[KEGG_GSEA[["mtv"]]@result$p.adjust<=0.05,c("setSize","enrichmentScore","NES","p.adjust")])
+enrichplot::dotplot(KEGG_GSEA[["mtc"]], x="NES", orderBy="p.adjust", title = "KEGG - mtc")
+# Display the data frame using kable
+knitr::kable(KEGG_GSEA[["mtc"]]@result[KEGG_GSEA[["mtc"]]@result$p.adjust<=0.05,c("setSize","enrichmentScore","NES","p.adjust")])
+# for(org in names(KEGG_GSEA)){
+#   if(nrow(KEGG_GSEA[[org]]@result)>0){
+#     enrichplot::dotplot(KEGG_GSEA[[org]], x="NES", orderBy="p.adjust", title = paste0("KEGG - ", org))
+#   }
+# }
 ```
 ## Summarize GSEA results
-```{r summarize_GSEA_makedf.R, include=TRUE}
+```{r summarize_GSEA_makedf.R, include=FALSE}
 GSEA_overall_results = data.frame(matrix(ncol=11, nrow=0))
 GSEA_overall_results_colnames = c("ID","Description","setSize","enrichmentScore","NES","pvalue","p.adjust","qvalues","rank","leading_edge","core_enrichment")#,"Category")
 colnames(GSEA_overall_results) = GSEA_overall_results_colnames
@@ -364,7 +441,6 @@ colnames(GSEA_overall_results) = GSEA_overall_results_colnames
 GSEA_overall_results = GSEA_summarize(GSEA_overall_results, GO_CC_GSEA)#, "GO_CC")
 GSEA_overall_results = GSEA_summarize(GSEA_overall_results, GO_BP_GSEA)#, "GO_BP")
 GSEA_overall_results = GSEA_summarize(GSEA_overall_results, GO_MF_GSEA)#, "GO_MF")
-print(GSEA_overall_results)
 for(name in names(KEGG_GSEA)){
   GSEA_overall_results = GSEA_summarize(GSEA_overall_results, KEGG_GSEA[[name]])} #, paste("KEGG_",name,sep=""))}
@@ -375,35 +451,35 @@ GSEA_overall_results = GSEA_summarize(GSEA_overall_results, MYB_PR)#, "MYB_PR")
 row.names(GSEA_overall_results) = NULL
-write.csv(GSEA_overall_results, paste(MAIN_PATH, "output_files/intermed_op3.csv", sep="/"), quote = F, col.names = T, row.names = F, sep = ",")
+write.table(unique(GSEA_overall_results), paste(MAIN_PATH, "output_files/intermed_GSEA_overall_results_scicore_cluster_230901.tsv", sep="/"), quote = F, col.names = T, row.names = F, sep = "|")
 ```
 ## Write down GSEA results
-```{r write_GSNEA.R, include=TRUE}
-GSEA_overall_results_short = GSEA_overall_results[,c("ID","p.adjust")]
+```{r write_GSNEA.R, echo=FALSE}
+GSEA_overall_results_short = GSEA_overall_results[,c("ID","enrichmentScore","NES","pvalue","p.adjust","qvalues")]
 # Display the data frame using kable
-knitr::kable(GSEA_overall_results_short)
+knitr::kable(unique(GSEA_overall_results_short[order(GSEA_overall_results_short$p.adjust),]))
 GSEA_output_file_name = rev(unlist(strsplit(ms_annotation_file, "/")))[1] %>%
   gsub("annotation_file_","",.) %>%
   gsub(".csv","",.) %>%
-  paste(.,"_GSEA_all_results.csv", sep="")
-write.csv(GSEA_overall_results, paste(MAIN_PATH, "output_files",GSEA_output_file_name, sep="/"), quote = F, col.names = T, row.names = F, sep = ",")
-print("table should be saved now.")
+  paste(.,"_GSEA_all_results.tsv", sep="")
+write.table(unique(GSEA_overall_results), paste(MAIN_PATH, "output_files",GSEA_output_file_name, sep="/"), quote = F, col.names = T, row.names = F, sep = "|")
+print("Here are your current environment variables:")
 print(ls())
 Sys.sleep(5)
 ```
 ## Show SessionInfo
-```{r writeSessionInfo.R, include=TRUE}
+```{r writeSessionInfo.R, echo=FALSE}
 sessionInfo()
 ```
-# End of Pipeline Analysis. But what to do with Quick Go data? Or is pipeline using it?
+# END-OF-ANALYSIS
...