diff --git a/R/run_GSEA.R b/R/run_GSEA.R index 1a8f55897c3ef66f7129ea9daa99d042e790830c..c7ba63ece3bf9210deb22c9adc069d44651463de 100644 --- a/R/run_GSEA.R +++ b/R/run_GSEA.R @@ -15,7 +15,7 @@ kegg_to_t2g = function(data_set){ GSEA_kegg_list <- list() for (org_id in kegg_orgids){ tmp_subset <- data_set[[org_id]] - tmp_subset$label_joined <- paste(tmp_subset$pathway_id, tmp_subset$pathway_name, sep="_") + tmp_subset$label_joined <- paste(gsub("path:","",tmp_subset$pathway_id), gsub(" - Mycobacterium tuberculosis CDC1551| - Mycobacterium tuberculosis H37Rv","",tmp_subset$pathway_name), sep="_") GSEA_kegg_list[[org_id]] <- tmp_subset[,c("label_joined","uniprot_id")] } return(GSEA_kegg_list) @@ -44,31 +44,32 @@ kegg_to_t2g = function(data_set){ run_GSEA = function(data_set, kingdom, pvalCOff,MAIN_PATH, kegg_list,mycobrowser_data){ + set.seed(1234) if(!(kingdom %in% c("GO_CC","GO_BP","GO_MF", "KEGG", "MYCOBROWSER_FC", "MYCOBROWSER_FU", "MYCOBROWSER_PR"))){print("WARNING, choose kingdom you want to analyze. Options:\"GO_CC\",\"GO_BP\",\"GO_MF\", \"KEGG\", \"MYCOBROWSER_FC\", \"MYCOBROWSER_FU\", \"MYCOBROWSER_PR\".")} if(kingdom == "GO_CC"){ print("You are running GSEA on GeneOntology category \"Cellular Compartment\".") load(paste(MAIN_PATH, "annotation_files", "cc_table_for_gmt.Rdata", sep="/")) # loads the data set: cc_table_for_gmt - return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=cc_table_for_gmt, by="fgsea")) #cutoff for adjusted pvalue + return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=cc_table_for_gmt, by="fgsea", seed=T)) #cutoff for adjusted pvalue } if(kingdom == "GO_BP"){ print("You are running GSEA on GeneOntology category \"Biological Process\".") load(paste(MAIN_PATH, "annotation_files", "bp_table_for_gmt.Rdata", sep="/")) # loads the data set: bp_table_for_gmt - return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=bp_table_for_gmt, by="fgsea")) + return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=bp_table_for_gmt, by="fgsea", seed=T)) } if(kingdom == "GO_MF"){ print("You are running GSEA on GeneOntology category \"Molecular Function\".") load(paste(MAIN_PATH, "annotation_files", "mf_table_for_gmt.Rdata", sep="/")) # loads the data set: mf_table_for_gmt - return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=mf_table_for_gmt, by="fgsea")) + return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=mf_table_for_gmt, by="fgsea", seed=T)) } if(kingdom == "KEGG"){ - print("You are running GSEA on kegg pathways. The code runs for every Mtb organism ID on KEGG separately, so the output will be a list, not a table as when running the function for GO terms.") + print("You are running GSEA on kegg pathways. The code runs for every Mtb organism ID on KEGG separately, so the output will be a list, not a table as when running the function for GO terms. Some KEGG organism IDs have contain different protein identifiers, not recognized by the GSEA Function. This could be corrected, but because they are highly redundant, this is not done here. Most important are results from organism IDs: mtu & mtv, which both are Mtb H37Rv.") GSEA_kegg_list = kegg_to_t2g(kegg_list) GSEA_kegg_result_list <- list() kegg_orgids <- names(GSEA_kegg_list) for (org_id in kegg_orgids){ message(paste("Running GSEA on comparison_",org_id," right now. Computing...")) runfun <- tryCatch( - res_GSEA <- GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=GSEA_kegg_list[[org_id]], by="fgsea"), + res_GSEA <- GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=GSEA_kegg_list[[org_id]], by="fgsea", seed=T), error = function(e) { message(paste("Running GSEA on the comparison: ",org_id," ended up in error...And Now for Something Completely Different.")) NA @@ -81,17 +82,17 @@ run_GSEA = function(data_set, kingdom, pvalCOff,MAIN_PATH, kegg_list,mycobrowser if(kingdom == "MYCOBROWSER_FC"){ print("You are running GSEA on mycobrowser category \"Functional Category\".") mycobrowser_FunCat <- unique(mycobrowser_data[,c("Functional_Category","UniProt_AC")]) - return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=mycobrowser_FunCat, by="fgsea")) + return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=mycobrowser_FunCat, by="fgsea", seed=T)) } if(kingdom == "MYCOBROWSER_FU"){ print("You are running GSEA on mycobrowser category \"Function\".") mycobrowser_Function <- unique(mycobrowser_data[,c("Function","UniProt_AC")]) - return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=mycobrowser_Function, by="fgsea")) + return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=mycobrowser_Function, by="fgsea", seed=T)) } if(kingdom == "MYCOBROWSER_PR"){ print("You are running GSEA on mycobrowser category \"Product\".") mycobrowser_Product <- unique(mycobrowser_data[,c("Product","UniProt_AC")]) - return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=mycobrowser_Product, by="fgsea")) + return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=mycobrowser_Product, by="fgsea", seed=T)) } } diff --git a/R/run_QC.R b/R/run_QC.R index 5b14f82681793b7e04e2c9ee6f3739bee1070a99..de2d7f32427095e85e20a56c3d807f5fe927a254 100644 --- a/R/run_QC.R +++ b/R/run_QC.R @@ -25,6 +25,7 @@ run_QC = function(input_data,qc_level){ input_data$norm_quantity <- 2^(input_data$normalised_intensity_log2) # 4.3 CVs print(qc_cvs(data = input_data, grouping = pep_grouping_key, condition = r_condition, intensity = pep_quantity, plot = TRUE, plot_style = "violin")) + print("Plot below shows CVs post normalization by 'pep_quantity'.") print(qc_cvs(data = input_data, grouping = pep_grouping_key, condition = r_condition, intensity = norm_quantity, plot = TRUE, plot_style = "violin")) # 4.4 PCA qc_pca( data = input_data, sample = r_file_name, grouping = pep_grouping_key, intensity = normalised_intensity_log2, condition = r_condition, digestion = NULL, plot_style = "scree" ) print(qc_pca(data = input_data, sample = r_file_name, grouping = pep_grouping_key, intensity = normalised_intensity_log2, condition = r_condition, components= c("PC1","PC2"), plot_style = "pca" ) ) @@ -35,6 +36,7 @@ run_QC = function(input_data,qc_level){ print(qc_ids(data = input_data, sample = r_file_name, grouping = pep_grouping_key, intensity = norm_quantity, condition = r_condition, title = "Peptide identifications per sample", plot = TRUE ) ) # 4.7 intensity distribution print(qc_intensity_distribution(data = input_data, sample = r_file_name, grouping = pep_grouping_key, intensity_log2 = transformed_quantity_log2 , plot_style = "boxplot") ) + print("Plot below shows intensity distribution post normalization by 'pep_quantity'.") print(qc_intensity_distribution(data = input_data, sample = r_file_name, grouping = pep_grouping_key, intensity_log2 = normalised_intensity_log2, plot_style = "boxplot")) # 4.8 data completeness print(qc_data_completeness(data = input_data, sample = r_file_name, grouping = pep_grouping_key, intensity = normalised_intensity_log2, plot = TRUE) ) @@ -48,6 +50,7 @@ run_QC = function(input_data,qc_level){ input_data$norm_quantity <- 2^(input_data$normalised_intensity_log2) # 4.3 CVs print(qc_cvs(data = input_data, grouping = pg_protein_groups, condition = r_condition, intensity = pg_quantity, plot = TRUE, plot_style = "violin")) + print("Plot below shows CVs post normalization by 'pg_quantity'.") print(qc_cvs(data = input_data, grouping = pg_protein_groups, condition = r_condition, intensity = norm_quantity, plot = TRUE, plot_style = "violin")) # 4.4 PCA qc_pca( data = input_data, sample = r_file_name, grouping = pg_protein_groups, intensity = normalised_intensity_log2, condition = r_condition, digestion = NULL, plot_style = "scree" ) print(qc_pca(data = input_data, sample = r_file_name, grouping = pg_protein_groups, intensity = normalised_intensity_log2, condition = r_condition, components= c("PC1","PC2"), plot_style = "pca" ) ) @@ -58,6 +61,7 @@ run_QC = function(input_data,qc_level){ print(qc_ids(data = input_data, sample = r_file_name, grouping = pg_protein_groups, intensity = norm_quantity, condition = r_condition, title = "Peptide identifications per sample", plot = TRUE ) ) # 4.7 intensity distribution print(qc_intensity_distribution(data = input_data, sample = r_file_name, grouping = pg_protein_groups, intensity_log2 = transformed_quantity_log2 , plot_style = "boxplot") ) + print("Plot below shows intensity distribution post normalization by 'pg_quantity'.") print(qc_intensity_distribution(data = input_data, sample = r_file_name, grouping = pg_protein_groups, intensity_log2 = normalised_intensity_log2, plot_style = "boxplot") ) # 4.8 data completeness print(qc_data_completeness(data = input_data, sample = r_file_name, grouping = pg_protein_groups, intensity = normalised_intensity_log2, plot = TRUE) ) diff --git a/inst/annotation_files/PROTEOME_PIPELINE_MUTANTMATRIX.txt b/inst/annotation_files/PROTEOME_PIPELINE_MUTANTMATRIX.txt index f70cbb199aa1954d2d7f8ff646813aef56285994..8a79a4d7b1ab1cd679d853277f47391eb0dcc754 100644 --- a/inst/annotation_files/PROTEOME_PIPELINE_MUTANTMATRIX.txt +++ b/inst/annotation_files/PROTEOME_PIPELINE_MUTANTMATRIX.txt @@ -9,8 +9,8 @@ N2030,N0157,L1,HF,L1_HF,rpoB,S450L,na,1 N2031,N0157,L1,LF,L1_LF,rpoB,H445R,na,1 N1888,N0145,L2,HF,L2_HF,rpoB,S450L,na,1 N1890,N0145,L2,LF,L2_LF,rpoB,H445R,na,1 -N2502,N1283,L4,HF,L4_HF,rpoB,S450L,na,1 -N2850,N1283,L4,LF,L4_LF,rpoB,H445R,na,1 +N2502,N1283,L4,LF,L4_LF,rpoB,H445R,na,1 +N2850,N1283,L4,HF,L4_HF,rpoB,S450L,na,1 N0157,N0157,L1,WT,L1_WT,WT,WT,1,0 N0145,N0145,L2,WT,L2_WT,WT,WT,1,0 -N1283,N1283,L4,WT,L4_WT,WT,WT,1,0 \ No newline at end of file +N1283,N1283,L4,WT,L4_WT,WT,WT,1,0 diff --git a/inst/rmarkdown/Mtb_Proteomics_Pipeline_worksheet.Rmd b/inst/rmarkdown/Mtb_Proteomics_Pipeline_worksheet.Rmd index e7dd22e0c76086cda600dd0c977854d12a5f5069..b01896b5d1717b59e1a4cb4c5f01a925d66ba57d 100644 --- a/inst/rmarkdown/Mtb_Proteomics_Pipeline_worksheet.Rmd +++ b/inst/rmarkdown/Mtb_Proteomics_Pipeline_worksheet.Rmd @@ -12,9 +12,7 @@ knitr::opts_chunk$set(echo = TRUE, message = TRUE) ``` -# Proteome analysis pipeline - -### Main goal +### How to run the pipeline Run the pipeline with the following code. Example for comparison of WT to S450L. ```{bash main_goal, eval=FALSE} sbatch --job-name JOB_NAME SLURM_SCRIPT.sh /sciCore/MAIN/PATH OUTPUTFILE_NAME EXPERIMENT_NAME ANNOTATION_FILE.csv @@ -22,77 +20,59 @@ e.g.: sbatch --job-name tbeeprotpip_WT_vs_S450L tbeeprotpip_slurm_script.sh /scicore/home/gagneux/bousel00/tb/TbX015 WT_vs_S450L_output TbX015 annotation_file_wt_vs_s450l.csv ``` -### Split the work in four parts +### Analysis pipeline * Load, filter and merge data * QC peptide level * QC protein level * Differential expression analysis - - Comparison level(s), defined by the composition of the annotation file + - Comparison level(s) are defined by the composition of the annotation file * Enrichment Analysis - Annotations - Gene Ontology - - MF - - CC - - BP + - Molecular Function + - Cellular Compartment + - Biological Process - KEGG Pathways - Mycobrowser - - FC + - Functional Category - Function - Product -### Thoughts and Comments -From <https://raps-with-r.dev/project_rewrite.html> I learn that I can shorten my script a lot and make it more easy readable and executable. -There the author also suggests to work with the {target} R package, which is also introduced here <https://books.ropensci.org/targets/walkthrough.html>. - -More concrete: -* work with git, to learn it, and because it will be safer in case of messing up [check] -* make separate R-scripts: [check] - - where you define all functions. source this script in the code [check] - - where you load, filter and merge the data [check] - - QC peptide-level (can maybe write a function for QC and then decide the level in function-input) [check] - - QC protein-level (can maybe write a function for QC and then decide the level in function-input) [check] - - Enrichment Analysis [check] -* modify for loops into apply functions, to make the script more robust (examples on <https://raps-with-r.dev/project_rewrite.html>) [no] -* for the script to run faster and to use less memory, make sure you unload all data-sets as soon as they are not needed in the work environment anymore (can check all loaded objects with {ls()}) [to_do] -* make the script write it's own report with comments, plots and tables. [check] -* try to make it work with target, so independent jobs can run in parallel and you can run the script on multiple cores in parallel (e.g. the comparison-levels) [to look into] -* provide a folder in the git-folder, where you have all annotation files [check] -* providing the extraction-OD data, I can also further test for correlation between CV and variability between extraction-ODs of the strain, and add this to the QC [to_do] -* write in report what package versions were used and what annotation data files (as they contain the date) [to_do] -* maybe it helps when reading in the mutant matrix, to make a list of it, for easier access of elements? [no] - -# load all packages -```{r loading_the_packages} -library("protti") -library("magrittr") -library("dplyr") # from tidyverse -library("tidyr") # from tidyverse -library("ggplot2") # from tidyverse -library("purrr") # from tidyverse -library("gplots") -library("ggpubr") -library("VennDiagram") -library("ggvenn") -library("KEGGREST") -library("Peptides") -library("stringr") -library("fgsea") -library("ggnewscale") -library("clusterProfiler") -library("pathview") -library("enrichplot") -library("biomaRt") -library("ggrepel") -library("msigdbr") -library("ggridges") -library("lattice") #to print pdfs from loops +TODO: providing the extraction-OD data, I can also further test for correlation between CV and variability between extraction-ODs of the strain, and add this to the QC [to_do] + +# Load all packages +```{r loading_the_packages, message=FALSE, warning=FALSE} +library(protti) +library(magrittr) +library(dplyr) # from tidyverse +library(tidyr) # from tidyverse +library(ggplot2) # from tidyverse +library(purrr) # from tidyverse +library(gplots) +library(ggpubr) +library(VennDiagram) +library(ggvenn) +library(KEGGREST) +library(Peptides) +library(stringr) +library(fgsea) +library(ggnewscale) +library(clusterProfiler) +library(pathview) +library(enrichplot) +library(biomaRt) +library(ggrepel) +library(msigdbr) +library(ggridges) +library(lattice) #to print pdfs from loops +#library(Cairo) +#library(png) ``` ## Load file paths The file paths are loaded from the command line input. The variables are put in on the comand line, the params argument from the rmarkdown::render() function passes the values into the Rscript and they can here be accessed via params$variableINeed. -```{r loading_the_paths} - +```{r loading_the_paths, echo = FALSE} #MAIN_PATH = "/scicore/home/gagneux/bousel00/tb/TbX015" MAIN_PATH = params$MAIN_PATH @@ -110,20 +90,28 @@ mutant_matrix = system.file("annotation_files", "PROTEOME_PIPELINE_MUTANTMATRIX. if (mutant_matrix == "") { stop(paste("Could not find `PROTEOME_PIPELINE_MUTANTMATRIX.txt`. Try re-installing `tbeeprotpip`."), call. = FALSE) } + +print(paste("MAIN_PATH:", MAIN_PATH)) +print(paste("EXPERIMENT:", EXPERIMENT)) +print(paste("ms_annotation_file:", ms_annotation_file)) +print(paste("mutant_matrix:", mutant_matrix)) ``` ## Load the data Here you load the Spectronaut output and the annotation data. -```{r load_data_spec.R} +```{r load_data_spec.R, include=FALSE} base_spectronaut_data <- read_protti(filename = spectronaut_xls_file, sep="\t") ``` -```{r load_data_annot.R} +```{r load_data_annot.R, echo=FALSE} #The annotation data contains only information about the samples and conditions of interest. annot <- read.csv(ms_annotation_file, header = TRUE, sep=";") + +# Display the data frame using kable +knitr::kable(annot) ``` -```{r load_data_mutantmat.R} +```{r load_data_mutantmat.R, echo=FALSE} # load the extraction-OD data @@ -131,20 +119,22 @@ annot <- read.csv(ms_annotation_file, header = TRUE, sep=";") # load the mutant matrix information mutantmatrix_data <- read.csv(mutant_matrix, sep=",", header=T) +# Display the data frame using kable +knitr::kable(mutantmatrix_data) ``` The function 'latest_file' will load the latest version of a file with a defined name and the date it was generated. -```{r load_data_uniprpt.R} +```{r load_data_uniprpt.R, include=FALSE} # load Uniprot annotation (excluding GO terms) | see script fetch_uniprot_separate_script.r load(latest_file(MAIN_PATH,"uniprot")) ``` -```{r load_data_mycobrow.R} +```{r load_data_mycobrow.R, include=FALSE} # load Mycobrowser annotation | downloaded from mycobrowser website on 24. May 2022 mycobrowser_data <- read.csv("/scicore/home/gagneux/bousel00/tb/TbX015/annotation_files/mycobrowser_Mycobacterium_tuberculosis_H37Rv_txt_v4.txt", header=T, sep="\t") ``` -```{r load_data_kegg.R} +```{r load_data_kegg.R, include=FALSE} # load KEGG annotation load(latest_file(MAIN_PATH,"kegg")) # loads the data set: kegg_list @@ -155,8 +145,8 @@ load(latest_file(MAIN_PATH,"kegg")) # loads the data set: kegg_list #load(paste(MAIN_PATH, "annotation_files", "GSEA_kegg_list.Rdata", sep="/")) # loads the data set: GSEA_kegg_list ``` -TO DO: think about what to do with the QuickGO data...and where the uniprot GO data stems from. -```{r load_data_qg.R} +TODO: check again if the QuickGO data is preferable to the uniprot GO (and how it is downloaded) +```{r load_data_qg.R, eval=FALSE} # load QuickGO annotation (all three domains) quick_go_data <- read.csv("/scicore/home/gagneux/bousel00/tb/TbX015/annotation_files/QuickGO_annotations_MTBC_1773_ReviewedAndUnreviewed_20220722.tsv", header=T, sep="\t") ``` @@ -171,18 +161,22 @@ go_mf_gmt = make_gmt(uniprot, "go_molecular_function") # uses my defined functio ``` ## Load data for GSEA -TO DO: These data sets remain to be generated from the gmt file...WHY DID I WRITE THIS DOWN? CHECK! -```{r load_data_for_GSEA} +TODO: These data sets remain to be generated from the gmt file...WHY DID I WRITE THIS DOWN? CHECK! +```{r load_data_for_GSEA, echo=FALSE} load(paste(MAIN_PATH, "annotation_files", "cc_table_for_gmt.Rdata", sep="/")) # loads the data set: cc_table_for_gmt load(paste(MAIN_PATH, "annotation_files", "bp_table_for_gmt.Rdata", sep="/")) # loads the data set: bp_table_for_gmt load(paste(MAIN_PATH, "annotation_files", "mf_table_for_gmt.Rdata", sep="/")) # loads the data set: mf_table_for_gmt + +print(paste(MAIN_PATH, "annotation_files", "cc_table_for_gmt.Rdata", sep="/")) +print(paste(MAIN_PATH, "annotation_files", "bp_table_for_gmt.Rdata", sep="/")) +print(paste(MAIN_PATH, "annotation_files", "mf_table_for_gmt.Rdata", sep="/")) ``` ## Filtering the data for Quality Control (QC) Here I remove the "r_replicate" column, because it will be added later on with the proper replicate numbers. Then I remove the base_spectronaut_data from the environment to save RAM. -Then I remove all decoy peptides and reshuffle the annot file. -```{r filtering_for_QC.R} +Then I remove all decoy peptides and reshuffle the columns of the annot file, for working with it. +```{r filtering_for_QC.R, include=FALSE} base_spectronaut_data_reduced <- subset(base_spectronaut_data, select=-c(r_replicate)) rm(base_spectronaut_data) @@ -201,8 +195,8 @@ annot$r_replicate <- c(1:dim(annot)[1]) ``` ## Annotate Spectronaut-Data -Here I annotate the data. As each line in "base_spectronaut_data_reduced" matches at most one line in y, choose relationship = "many-to-one". Then filter the data, so only the rows of interest are kept, menaing only the ones which got an entry from the merging with the annotation data. Then remove the two old data sets, to not overload the environment memory -```{r annotating_with_MS_annotation.R} +Here I annotate the data. As each line in "base_spectronaut_data_reduced" matches at most one line in y, choose relationship = "many-to-one". Then filter the data, so only the rows of interest are kept, meaning only the ones which got an entry from the merging with the annotation data. Then remove the two old data sets, to sve RAM. +```{r annotating_with_MS_annotation.R, include=FALSE} spectronaut_data <- base_spectronaut_data_reduced %>% full_join(annot, by="r_file_name", relationship = "many-to-one") %>% # filter rows where "Condition" is empty @@ -217,28 +211,28 @@ rm(base_spectronaut_data_reduced, annot) ## Quality Control- Peptide Level Here I conduct quality control of the data. First on peptide level. Some plots are shown twice, because they first show the un-normalized data, then the normalized. -```{r pepQC.R} +```{r pepQC.R, echo=FALSE} strwrap("First QualityControl is run on peptide level.",prefix = "\n", initial = "") run_QC(spectronaut_data, "peptide") ``` ## Quality Control- Protein Level Here I conduct quality control of the data. Now on protein level. Some plots are shown twice, because they first show the un-normalized data, then the normalized. -```{r protQC.R} +```{r protQC.R, echo=FALSE} strwrap("Second QualityControl is run on protein level.",prefix = "\n", initial = "") run_QC(spectronaut_data, "protein") ``` ## Filter the data for Expression Analysis Remove the non-proteotypic peptides. -```{r filtering_for_EA.R} +```{r filtering_for_EA.R, include=FALSE} data_filtered_proteotypic <- spectronaut_data %>% filter(pep_is_proteotypic == TRUE) ``` ## Calculate the sequence coverage Here-fore, first annotate the data with the mutant matrix, which contains information needed for peptide type and sequence coverage. -```{r calculate_sequence_coverage.R} +```{r calculate_sequence_coverage.R, echo=FALSE} data_filtered_uniprot <- data_filtered_proteotypic %>% left_join(y = uniprot, by = "pg_protein_accessions") %>% @@ -263,7 +257,7 @@ qc_sequence_coverage( ## Calculate missingness and differencial abundance of proteins These functions from protti calculate the missingness based on adjustable thresholds and calculate the differential abundance of proteins. Furthermore a new column is added containing logicals if protein is significantly DE or not. To do so, the protein intensities need to be normalized first. This is already done in the QC, but not transferred from there to the global environment. -```{r calculate_missingness_and_DE.R} +```{r calculate_missingness_and_DE.R, echo=FALSE} data_filtered_uniprot$transformed_quantity_log2 = log2(data_filtered_uniprot$pg_quantity) data_filtered_uniprot <- normalise(data_filtered_uniprot, sample = r_file_name, intensity_log2 = transformed_quantity_log2, method = "median") # this adds the column "normalised_intensity_log2" data_filtered_uniprot$norm_quantity <- 2^(data_filtered_uniprot$normalised_intensity_log2) @@ -272,19 +266,49 @@ diff_abundance_data = calculate_missingness_and_DE(data_filtered_uniprot, 0.7, 0 diff_abundance_significant <- diff_abundance_data %>% mutate(is_significant = ifelse((adj_pval < 0.01), TRUE, FALSE)) + +diff_abundance_significant_file_name = rev(unlist(strsplit(ms_annotation_file, "/")))[1] %>% + gsub("annotation_file_","",.) %>% + gsub(".csv","",.) %>% + paste(.,"_GSEA_diff_abundance_significant.tsv", sep="") + +diff_abundance_significant_export_subset_file_name = rev(unlist(strsplit(ms_annotation_file, "/")))[1] %>% + gsub("annotation_file_","",.) %>% + gsub(".csv","",.) %>% + paste(.,"_GSEA_diff_abundance_subset_significant.tsv", sep="") + +# diff_abundance_significant_export_subset = unique(diff_abundance_significant[diff_abundance_significant$is_significant==TRUE,c("r_file_name","r_condition","r_replicate","pg_protein_groups","pg_protein_accessions","pg_quantity","SampleDescription","COI","n_number","length","mass","go_biological_process","go_cellular_component","go_molecular_function","go_id","cognate_wt","lin_class","mut_gene","aa_substitution","transformed_quantity_log2","norm_quantity","comparison","missingness","pval","adj_pval","diff","t_statistic","std_error","df","avg_abundance","n_obs","is_significant")]) +# take also non-significant ones, so can draw the plot +diff_abundance_significant_export_subset = unique(diff_abundance_significant[,c("r_file_name","r_condition","r_replicate","pg_protein_groups","pg_protein_accessions","pg_quantity","SampleDescription","COI","n_number","length","mass","go_biological_process","go_cellular_component","go_molecular_function","go_id","cognate_wt","lin_class","mut_gene","aa_substitution","transformed_quantity_log2","norm_quantity","comparison","missingness","pval","adj_pval","diff","t_statistic","std_error","df","avg_abundance","n_obs","is_significant")]) + +diff_abundance_significant_print_data = unique(diff_abundance_significant[diff_abundance_significant$is_significant==TRUE,c("n_number","pg_protein_groups","COI","aa_substitution","norm_quantity","comparison","adj_pval")]) + +diff_abundance_significant_print_data$tmp_data = paste(diff_abundance_significant_print_data$COI, diff_abundance_significant_print_data$pg_protein_groups, sep="_") + +means_table = aggregate(diff_abundance_significant_print_data$norm_quantity, list(diff_abundance_significant_print_data$tmp_data), FUN=mean) +names(means_table) = c("tmp_data", "mean_norm_quantity") + +diff_abundance_significant_print_data = diff_abundance_significant_print_data[,-c(1,4,5)] %>% + right_join(means_table, by="tmp_data") + +# Display the data frame using kable +knitr::kable(unique(diff_abundance_significant_print_data[,-5])) + +#write.table(data.frame(diff_abundance_significant), paste(MAIN_PATH, "output_files",diff_abundance_significant_file_name, sep="/"), quote = F, col.names = T, row.names = F, sep = "|") +write.table(data.frame(diff_abundance_significant_export_subset), paste(MAIN_PATH, "output_files",diff_abundance_significant_export_subset_file_name, sep="/"), quote = F, col.names = T, row.names = F, sep = "|") ``` ## Subset the data for GSEA Here the diff_abundance_data is subseted for only columns necessary for GSEA. -```{r subset_for_GSEA.R} +```{r subset_for_GSEA.R, include=FALSE} diff_abundance_data_DEsubset <- unique(diff_abundance_data[c("pg_protein_groups","comparison","diff","t_statistic","adj_pval")]) ``` ## Calculate and plot the p-value distribution This can already tell something about the data. -The non-adjusted p-values are plotted, to look for any tendencies my data might has, e.g. left peak shows that often my nullhypothesis can be rejected. The tail on the right shows level of FDR to set for minimizing false positives. The function automatically creates a plot. -```{r pval_distribution.R} +The non-adjusted p-values are plotted, to look for any tendencies my data might has, e.g. left peak shows that often my null hypothesis can be rejected. The tail on the right shows level of FDR to set for minimizing false positives. The function automatically creates a plot. +```{r pval_distribution.R, echo=FALSE} pval_distribution_plot(data = diff_abundance_data, grouping = pg_protein_accessions, # correct? pval = pval @@ -292,15 +316,15 @@ pval_distribution_plot(data = diff_abundance_data, ``` ## Volcano Plot -Here a volcano plot of the comparison is drawn. The function automatically creates a plot. -```{r volcanoplot.R} -volcanoPlot_file_name = rev(unlist(strsplit(ms_annotation_file, "/")))[1] %>% +Here a volcano plot of the comparison is drawn. The function automatically creates a plot. INFO: Because the plot contains so many data points, it is not depicted here. Therefore it is just exported as a PDF (which you probably cannot open easily neither, so convert to PNG somehow). +```{r volcanoplot.R, echo=FALSE} +volcanoPlot_file_name_pdf = rev(unlist(strsplit(ms_annotation_file, "/")))[1] %>% gsub("annotation_file_","",.) %>% gsub(".csv","",.) %>% paste(.,"_volcanoPlot.pdf", sep="") graphics.off() -pdf(paste(MAIN_PATH, "output_files", volcanoPlot_file_name, sep="/")) +pdf(paste(MAIN_PATH, "output_files", volcanoPlot_file_name_pdf, sep="/")) print( volcano_plot( data = diff_abundance_data, @@ -317,20 +341,26 @@ volcano_plot( ) ) dev.off() +print(paste("PDF is exported to: ",paste(MAIN_PATH, "output_files", volcanoPlot_file_name_pdf, sep="/"))) ``` ## protti-based GO analysis I have made this this a function, where you can choose if it should calculate CC, BP, MF and if you want to have a table or a plot returned. -```{r protti_GO_analysis.R} -go_cc_protti_analysis = protti_GO_analysis(diff_abundance_significant, "CC", T) -go_cc_protti_analysis = protti_GO_analysis(diff_abundance_significant, "BP", T) -go_cc_protti_analysis = protti_GO_analysis(diff_abundance_significant, "MF", T) - +```{r protti_GO_analysis.R, echo=FALSE} +go_cc_protti_analysis = protti_GO_analysis(diff_abundance_significant, "CC", F) +print("protti - Cellular Compartment") +knitr::kable(go_cc_protti_analysis) +go_bp_protti_analysis = protti_GO_analysis(diff_abundance_significant, "BP", F) +print("protti - Biological Process") +knitr::kable(go_bp_protti_analysis) +go_mf_protti_analysis = protti_GO_analysis(diff_abundance_significant, "MF", F) +print("protti - Molecular Function") +knitr::kable(go_mf_protti_analysis) ``` ## Rank the proteins for GSEA -PREREQUISITES: rank proteins per condition based on their t-statistics. -```{r rank_proteins.R} +For Gene Set Enrichment Analysis (GSEA) rank proteins per condition based on their t-statistics. +```{r rank_proteins.R, include = FALSE} ranked_proteins = as.numeric(diff_abundance_data_DEsubset$t_statistic) names(ranked_proteins) = diff_abundance_data_DEsubset$pg_protein_groups ranked_proteins = sort(ranked_proteins) @@ -341,22 +371,69 @@ One can choose to run GSEA on GO terms, KEGG pathways or Mycobrowser categories. TO DO: check that none of these commands are dependent on the result of the protti_GO_analysis.R script. There is also a protti function for the same task, however this would need more data transformation and reshuffeling (I did it but it's not very straight forward and therefore error prone). Additionally, I want all GSEA to be calculated the same way. +First transform some data so GSEA of KEGG and Mycobrowser work. +```{r GSEA.R} +GO_CC_GSEA = run_GSEA(ranked_proteins, "GO_CC", 1, MAIN_PATH, kegg_list, mycobrowser_data) +GO_BP_GSEA = run_GSEA(ranked_proteins, "GO_BP", 1, MAIN_PATH, kegg_list, mycobrowser_data) +GO_MF_GSEA = run_GSEA(ranked_proteins, "GO_MF", 1, MAIN_PATH, kegg_list, mycobrowser_data) +KEGG_GSEA = run_GSEA(ranked_proteins, "KEGG", 1, MAIN_PATH, kegg_list, mycobrowser_data) + +MYB_FC = run_GSEA(ranked_proteins, "MYCOBROWSER_FC", 1, MAIN_PATH, kegg_list, mycobrowser_data) +MYB_FU = run_GSEA(ranked_proteins, "MYCOBROWSER_FU", 1, MAIN_PATH, kegg_list, mycobrowser_data) +MYB_PR = run_GSEA(ranked_proteins, "MYCOBROWSER_PR", 1, MAIN_PATH, kegg_list, mycobrowser_data) +``` + +## Plot GSEA results First transform some data so GSEA of KEGG and Mycobrowser work -```{r GSEA.R} -GO_CC_GSEA = run_GSEA(ranked_proteins, "GO_CC", 0.01, MAIN_PATH, kegg_list, mycobrowser_data) -GO_BP_GSEA = run_GSEA(ranked_proteins, "GO_BP", 0.01, MAIN_PATH, kegg_list, mycobrowser_data) -GO_MF_GSEA = run_GSEA(ranked_proteins, "GO_MF", 0.01, MAIN_PATH, kegg_list, mycobrowser_data) +```{r plot_GSEA.R, echo=FALSE} + +enrichplot::dotplot(GO_CC_GSEA, x="NES", orderBy="p.adjust", title="GO - Cellular Compartment") +# Display the data frame using kable +knitr::kable(GO_CC_GSEA@result[GO_CC_GSEA@result$p.adjust<=0.05,c("setSize","enrichmentScore","NES","p.adjust")]) + +enrichplot::dotplot(GO_BP_GSEA, x="NES", orderBy="p.adjust", title="GO - Biological Process") +# Display the data frame using kable +knitr::kable(GO_BP_GSEA@result[GO_BP_GSEA@result$p.adjust<=0.05,c("setSize","enrichmentScore","NES","p.adjust")]) + +enrichplot::dotplot(GO_MF_GSEA, x="NES", orderBy="p.adjust", title="GO - Molecular Function") +# Display the data frame using kable +knitr::kable(GO_MF_GSEA@result[GO_MF_GSEA@result$p.adjust<=0.05,c("setSize","enrichmentScore","NES","p.adjust")]) + +enrichplot::dotplot(MYB_FC, x="NES", orderBy="p.adjust", title="GO - Functional Category") +# Display the data frame using kable +knitr::kable(MYB_FC@result[MYB_FC@result$p.adjust<=0.05,c("setSize","enrichmentScore","NES","p.adjust")]) + +enrichplot::dotplot(MYB_FU, x="NES", orderBy="p.adjust", title="GO - Function") +# Display the data frame using kable +knitr::kable(MYB_FU@result[MYB_FU@result$p.adjust<=0.05,c("setSize","enrichmentScore","NES","p.adjust")]) + +enrichplot::dotplot(MYB_PR, x="NES", orderBy="p.adjust", title="GO - Product") +# Display the data frame using kable +knitr::kable(MYB_PR@result[MYB_PR@result$p.adjust<=0.05,c("setSize","enrichmentScore","NES","p.adjust")]) + +enrichplot::dotplot(KEGG_GSEA[["mtu"]], x="NES", orderBy="p.adjust", title = "KEGG - mtu") +# Display the data frame using kable +knitr::kable(KEGG_GSEA[["mtu"]]@result[KEGG_GSEA[["mtu"]]@result$p.adjust<=0.05,c("setSize","enrichmentScore","NES","p.adjust")]) + +enrichplot::dotplot(KEGG_GSEA[["mtv"]], x="NES", orderBy="p.adjust", title = "KEGG - mtv") +# Display the data frame using kable +knitr::kable(KEGG_GSEA[["mtv"]]@result[KEGG_GSEA[["mtv"]]@result$p.adjust<=0.05,c("setSize","enrichmentScore","NES","p.adjust")]) + +enrichplot::dotplot(KEGG_GSEA[["mtc"]], x="NES", orderBy="p.adjust", title = "KEGG - mtc") +# Display the data frame using kable +knitr::kable(KEGG_GSEA[["mtc"]]@result[KEGG_GSEA[["mtc"]]@result$p.adjust<=0.05,c("setSize","enrichmentScore","NES","p.adjust")]) -KEGG_GSEA = run_GSEA(ranked_proteins, "KEGG", 0.01, MAIN_PATH, kegg_list, mycobrowser_data) +# for(org in names(KEGG_GSEA)){ +# if(nrow(KEGG_GSEA[[org]]@result)>0){ +# enrichplot::dotplot(KEGG_GSEA[[org]], x="NES", orderBy="p.adjust", title = paste0("KEGG - ", org)) +# } +# } -MYB_FC = run_GSEA(ranked_proteins, "MYCOBROWSER_FC", 0.01, MAIN_PATH, kegg_list, mycobrowser_data) -MYB_FU = run_GSEA(ranked_proteins, "MYCOBROWSER_FU", 0.01, MAIN_PATH, kegg_list, mycobrowser_data) -MYB_PR = run_GSEA(ranked_proteins, "MYCOBROWSER_PR", 0.01, MAIN_PATH, kegg_list, mycobrowser_data) ``` ## Summarize GSEA results -```{r summarize_GSEA_makedf.R, include=TRUE} +```{r summarize_GSEA_makedf.R, include=FALSE} GSEA_overall_results = data.frame(matrix(ncol=11, nrow=0)) GSEA_overall_results_colnames = c("ID","Description","setSize","enrichmentScore","NES","pvalue","p.adjust","qvalues","rank","leading_edge","core_enrichment")#,"Category") colnames(GSEA_overall_results) = GSEA_overall_results_colnames @@ -364,7 +441,6 @@ colnames(GSEA_overall_results) = GSEA_overall_results_colnames GSEA_overall_results = GSEA_summarize(GSEA_overall_results, GO_CC_GSEA)#, "GO_CC") GSEA_overall_results = GSEA_summarize(GSEA_overall_results, GO_BP_GSEA)#, "GO_BP") GSEA_overall_results = GSEA_summarize(GSEA_overall_results, GO_MF_GSEA)#, "GO_MF") -print(GSEA_overall_results) for(name in names(KEGG_GSEA)){ GSEA_overall_results = GSEA_summarize(GSEA_overall_results, KEGG_GSEA[[name]])} #, paste("KEGG_",name,sep=""))} @@ -375,35 +451,35 @@ GSEA_overall_results = GSEA_summarize(GSEA_overall_results, MYB_PR)#, "MYB_PR") row.names(GSEA_overall_results) = NULL -write.csv(GSEA_overall_results, paste(MAIN_PATH, "output_files/intermed_op3.csv", sep="/"), quote = F, col.names = T, row.names = F, sep = ",") +write.table(unique(GSEA_overall_results), paste(MAIN_PATH, "output_files/intermed_GSEA_overall_results_scicore_cluster_230901.tsv", sep="/"), quote = F, col.names = T, row.names = F, sep = "|") ``` ## Write down GSEA results -```{r write_GSNEA.R, include=TRUE} -GSEA_overall_results_short = GSEA_overall_results[,c("ID","p.adjust")] +```{r write_GSNEA.R, echo=FALSE} +GSEA_overall_results_short = GSEA_overall_results[,c("ID","enrichmentScore","NES","pvalue","p.adjust","qvalues")] # Display the data frame using kable -knitr::kable(GSEA_overall_results_short) +knitr::kable(unique(GSEA_overall_results_short[order(GSEA_overall_results_short$p.adjust),])) GSEA_output_file_name = rev(unlist(strsplit(ms_annotation_file, "/")))[1] %>% gsub("annotation_file_","",.) %>% gsub(".csv","",.) %>% - paste(.,"_GSEA_all_results.csv", sep="") + paste(.,"_GSEA_all_results.tsv", sep="") -write.csv(GSEA_overall_results, paste(MAIN_PATH, "output_files",GSEA_output_file_name, sep="/"), quote = F, col.names = T, row.names = F, sep = ",") -print("table should be saved now.") +write.table(unique(GSEA_overall_results), paste(MAIN_PATH, "output_files",GSEA_output_file_name, sep="/"), quote = F, col.names = T, row.names = F, sep = "|") +print("here your current environment variables:") print(ls()) Sys.sleep(5) ``` ## Show SessionInfo -```{r writeSessionInfo.R, include=TRUE} +```{r writeSessionInfo.R, echo=FALSE} sessionInfo() ``` -# End of Pipeline Analysis. But what to do with Quick Go data? Or is pipeline using it? +# END-OF-ANALYSIS