diff --git a/R/GSEA_summarize.R b/R/GSEA_summarize.R
index a957f9355d4827d197eb34d8da2b249976bb9f95..089fd087dd7f018311e75da4ce62e43cb18fa939 100644
--- a/R/GSEA_summarize.R
+++ b/R/GSEA_summarize.R
@@ -4,16 +4,18 @@
#'
#' @param data_set_to_extend need specification
#' @param data_set_to_attach need specification
-#' @param cathegory need specification
#'
#' @return This function summarizes all results from all GSEA.
#'
-#' @import clusterProfiler
#' @export
-GSEA_summarize = function(data_set_to_extend, data_set_to_attach, cathegory){
+GSEA_summarize = function(data_set_to_extend, data_set_to_attach){ #}, category){
if(nrow(data_set_to_attach@result)!=0){
- data_set_to_attach@result$Cathegory = cathegory
- GSEA_overall_results = rbind(data_set_to_extend, c(data_set_to_attach@result))
- GSEA_overall_results <<- rbind(data_set_to_extend, c(data_set_to_attach@result))}
+ data_to_attach = as.data.frame(data_set_to_attach)
+ #data_to_attach$Category = rep(category, nrow(data_to_attach))
+ final_data = rbind(data_set_to_extend, data_to_attach)
+ } else {
+ final_data = data_set_to_extend
+ }
+ return(final_data)
}
diff --git a/R/run_GSEA_allKEGGids.R b/R/run_GSEA_allKEGGids.R
new file mode 100644
index 0000000000000000000000000000000000000000..e04881d4da857dc986951542586c76245ebc1529
--- /dev/null
+++ b/R/run_GSEA_allKEGGids.R
@@ -0,0 +1,108 @@
+#' make TERMM2GENE list for KEGG pathways
+#'
+#' This function converts the KEGG list into TERM2GENE format, which is required by the GSEA function. The kegg list contains tables for different kegg-organism IDs of M. tuberculosis. Therefore loop through each organism ID
+#'
+#' @param data_set the kegg_list (need specification)
+#'
+#' @return This function conducts GSEA using the clusterProfiler package.
+#'
+#' @export
+
+kegg_to_t2g = function(data_set){
+ # data_set (which is kegg_list) contains tables for different kegg-organism-ids
+ kegg_orgids <- names(data_set)
+ # write a loop, which reshuffles the data_set, so it's useful for GSEA input
+ GSEA_kegg_list <- list()
+ for (org_id in kegg_orgids){
+ tmp_subset <- data_set[[org_id]]
+ tmp_subset$label_joined <- paste(tmp_subset$pathway_id, tmp_subset$pathway_name, sep="_")
+ GSEA_kegg_list[[org_id]] <- tmp_subset[,c("label_joined","uniprot_id")]
+ }
+ return(GSEA_kegg_list)
+ }
+
+
+
+
+#' Conduct GSEA
+#'
+#' This function conducts GSEA. You can choose if it should calculate CC, BP, MF. Make it a function, where one can choose, what entity the GSEA should be run on (GO_CC, GO_BP, GO_MF, KEGG etc.)
+#' You can check out the result table via "GSEA_results@result" (for further info check str(GSEA_results))
+#'
+#' @param data_set A data set containing the ranged proteins... need specification
+#' @param kingdom GO kingdom, must be "CC or "BP" or "MF".
+#' @param pvalCOff set the cutoff for the p-value (e.g. 0.01)
+#' @param MAIN_PATH provide main path
+#' @param kegg_list provide kegg list
+#' @param mycobrowser_data provide mycobrowser_data
+#'
+#' @return This function conducts GSEA using the clusterProfiler package.
+#'
+#' @import clusterProfiler
+#' @export
+
+
+
+run_GSEA_allKEGG_ids = function(data_set, kingdom, pvalCOff,MAIN_PATH, kegg_list,mycobrowser_data){
+ if(!(kingdom %in% c("GO_CC","GO_BP","GO_MF", "KEGG", "MYCOBROWSER_FC", "MYCOBROWSER_FU", "MYCOBROWSER_PR"))){print("WARNING, choose kingdom you want to analyze. Options:\"GO_CC\",\"GO_BP\",\"GO_MF\", \"KEGG\", \"MYCOBROWSER_FC\", \"MYCOBROWSER_FU\", \"MYCOBROWSER_PR\".")}
+ if(kingdom == "GO_CC"){
+ print("You are running GSEA on GeneOntology category \"Cellular Compartment\".")
+ load(paste(MAIN_PATH, "annotation_files", "cc_table_for_gmt.Rdata", sep="/")) # loads the data set: cc_table_for_gmt
+ return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=cc_table_for_gmt, by="fgsea")) #cutoff for adjusted pvalue
+ }
+ if(kingdom == "GO_BP"){
+ print("You are running GSEA on GeneOntology category \"Biological Process\".")
+ load(paste(MAIN_PATH, "annotation_files", "bp_table_for_gmt.Rdata", sep="/")) # loads the data set: bp_table_for_gmt
+ return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=bp_table_for_gmt, by="fgsea"))
+ }
+ if(kingdom == "GO_MF"){
+ print("You are running GSEA on GeneOntology category \"Molecular Function\".")
+ load(paste(MAIN_PATH, "annotation_files", "mf_table_for_gmt.Rdata", sep="/")) # loads the data set: mf_table_for_gmt
+ return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=mf_table_for_gmt, by="fgsea"))
+ }
+ if(kingdom == "KEGG"){
+ print("You are running GSEA on kegg pathways. The code runs for every Mtb organism ID on KEGG separately, so the output will be a list, not a table as when running the function for GO terms.")
+ GSEA_kegg_list = kegg_to_t2g(kegg_list)
+ GSEA_kegg_result_list <- list()
+ kegg_orgids <- names(GSEA_kegg_list)
+ for (org_id in kegg_orgids){
+ message(paste("Running GSEA on comparison_",org_id," right now. Computing..."))
+ runfun <- tryCatch(
+ res_GSEA <- GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=GSEA_kegg_list[[org_id]], by="fgsea"),
+ error = function(e) {
+ message(paste("Running GSEA on the comparison: ",org_id," ended up in error...And Now for Something Completely Different."))
+ NA
+ }
+ )
+ GSEA_kegg_result_list[[org_id]] <- res_GSEA
+ }
+ return(GSEA_kegg_result_list)
+ }
+ if(kingdom == "MYCOBROWSER_FC"){
+ print("You are running GSEA on mycobrowser category \"Functional Category\".")
+ mycobrowser_FunCat <- unique(mycobrowser_data[,c("Functional_Category","UniProt_AC")])
+ return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=mycobrowser_FunCat, by="fgsea"))
+ }
+ if(kingdom == "MYCOBROWSER_FU"){
+ print("You are running GSEA on mycobrowser category \"Function\".")
+ mycobrowser_Function <- unique(mycobrowser_data[,c("Function","UniProt_AC")])
+ return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=mycobrowser_Function, by="fgsea"))
+ }
+ if(kingdom == "MYCOBROWSER_PR"){
+ print("You are running GSEA on mycobrowser category \"Product\".")
+ mycobrowser_Product <- unique(mycobrowser_data[,c("Product","UniProt_AC")])
+ return(GSEA(rev(data_set), minGSSize = 1, maxGSSize = (Inf), eps = 0, pvalueCutoff = pvalCOff, pAdjustMethod = "BH", TERM2GENE=mycobrowser_Product, by="fgsea"))
+ }
+}
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/inst/rmarkdown/Mtb_Proteomics_Pipeline_worksheet.Rmd b/inst/rmarkdown/Mtb_Proteomics_Pipeline_worksheet.Rmd
index 1d5772ef5e265cfa5e845ba4d785f4c0100a4805..e7dd22e0c76086cda600dd0c977854d12a5f5069 100644
--- a/inst/rmarkdown/Mtb_Proteomics_Pipeline_worksheet.Rmd
+++ b/inst/rmarkdown/Mtb_Proteomics_Pipeline_worksheet.Rmd
@@ -15,11 +15,11 @@ knitr::opts_chunk$set(echo = TRUE,
# Proteome analysis pipeline
### Main goal
-Different input files can be specified in the R-code with the "argparse" package. In the end I want to be able to run the pipeline from a slurm script like following:
+Run the pipeline with the following code. Example for comparison of WT to S450L.
```{bash main_goal, eval=FALSE}
-R pipeline_script.R -f input_sne_file.xls -a input_file_containing_all_filepaths --git path/to/respective/git/repo -o specify_outputfolder --c datafile_with_desired_comparisons
-# or
-sbatch proteome_analysis_pipeline_script.sh -i input_file -a annotation_file -c comparison_file -d(detection_cutoff) 0.75 -o output_file_name_prefix
+sbatch --job-name JOB_NAME SLURM_SCRIPT.sh /sciCore/MAIN/PATH OUTPUTFILE_NAME EXPERIMENT_NAME ANNOTATION_FILE.csv
+e.g.:
+sbatch --job-name tbeeprotpip_WT_vs_S450L tbeeprotpip_slurm_script.sh /scicore/home/gagneux/bousel00/tb/TbX015 WT_vs_S450L_output TbX015 annotation_file_wt_vs_s450l.csv
```
### Split the work in four parts
@@ -45,21 +45,21 @@ From <https://raps-with-r.dev/project_rewrite.html> I learn that I can shorten m
There the author also suggests to work with the {target} R package, which is also introduced here <https://books.ropensci.org/targets/walkthrough.html>.
More concrete:
-* work with git, to learn it, and because it will be safer in case of messing up
-* make separate R-scripts:
- - where you define all functions. source this script in the code
- - where you load, filter and merge the data
- - QC peptide-level (can maybe write a function for QC and then decide the level in function-input)
- - QC protein-level (can maybe write a function for QC and then decide the level in function-input)
- - Enrichment Analysis
-* modify for loops into apply functions, to make the script more robust (examples on <https://raps-with-r.dev/project_rewrite.html>)
-* for the script to run faster and to use less memory, make sure you unload all data-sets as soon as they are not needed in the work environment anymore (can check all loaded objects with {ls()})
-* make the script write it's own report with comments, plots and tables.
-* try to make it work with target, so independent jobs can run in parallel and you can run the script on multiple cores in parallel (e.g. the comparison-levels)
-* provide a folder in the git-folder, where you have all annotation files
-* providing the extraction-OD data, I can also further test for correlation between CV and variability between extraction-ODs of the strain, and add this to the QC
-* write in report what package versions were used and what annotation data files (as they contain the date)
-* maybe it helps when reading in the mutant matrix, to make a list of it, for easier access of elements?
+* work with git, to learn it, and because it will be safer in case of messing up [check]
+* make separate R-scripts: [check]
+ - where you define all functions. source this script in the code [check]
+ - where you load, filter and merge the data [check]
+ - QC peptide-level (can maybe write a function for QC and then decide the level in function-input) [check]
+ - QC protein-level (can maybe write a function for QC and then decide the level in function-input) [check]
+ - Enrichment Analysis [check]
+* modify for loops into apply functions, to make the script more robust (examples on <https://raps-with-r.dev/project_rewrite.html>) [no]
+* for the script to run faster and to use less memory, make sure you unload all data-sets as soon as they are not needed in the work environment anymore (can check all loaded objects with {ls()}) [to_do]
+* make the script write it's own report with comments, plots and tables. [check]
+* try to make it work with target, so independent jobs can run in parallel and you can run the script on multiple cores in parallel (e.g. the comparison-levels) [to look into]
+* provide a folder in the git-folder, where you have all annotation files [check]
+* providing the extraction-OD data, I can also further test for correlation between CV and variability between extraction-ODs of the strain, and add this to the QC [to_do]
+* write in report what package versions were used and what annotation data files (as they contain the date) [to_do]
+* maybe it helps when reading in the mutant matrix, to make a list of it, for easier access of elements? [no]
# load all packages
```{r loading_the_packages}
@@ -88,25 +88,20 @@ library("ggridges")
library("lattice") #to print pdfs from loops
```
-## Set the file paths
-Here you set all important paths. Not sure if I am going to need this script in the final more developed stage. Maybe provide all file names and paths in a separate input file
+## Load file paths
+The file paths are loaded from the command line input.
The variables are put in on the comand line, the params argument from the rmarkdown::render() function passes the values into the Rscript and they can here be accessed via params$variableINeed.
```{r loading_the_paths}
-#MAIN_PATH = "/scicore/home/gagneux/bousel00/tb/proteomics/MTB_proteome_pipeline"
#MAIN_PATH = "/scicore/home/gagneux/bousel00/tb/TbX015"
MAIN_PATH = params$MAIN_PATH
-#EXPERIMENT = "PROEVO001"
#EXPERIMENT = "TbX015"
EXPERIMENT = params$EXPERIMENT
-#spectronaut_xls_file = "/scicore/home/gagneux/bousel00/tb/proteomics/PROEVO001/spectronaut_report/20220505_155013_P462_SB_Quant1SW_20210510_Report_prottiadapted_corrected.xls"
#spectronaut_xls_file = "/scicore/home/gagneux/bousel00/tb/TbX015/spectronaut_report_files/20230804_153908_P462_SB_TbX015_directDIA_Report.tsv"
spectronaut_xls_file = params$spectronaut_xls_file
-print(spectronaut_xls_file)
-#ms_annotation_file = "/scicore/home/gagneux/bousel00/tb/proteomics/PROEVO001/files_sent_by_tom/MSstats_analysis_containingOldreport/P462_SB_Quant1SW_20210510_ConditionSetup.xls"
#ms_annotation_file = "/scicore/home/gagneux/bousel00/tb/TbX015/annotation_file_wt_vs_s450l.csv"
ms_annotation_file = paste(MAIN_PATH, params$ms_annotation_file,sep="/")
@@ -117,19 +112,13 @@ mutant_matrix = system.file("annotation_files", "PROTEOME_PIPELINE_MUTANTMATRIX.
}
```
-
-
-
-
## Load the data
-Here you load (and merge?) all data sets.
+Here you load the Spectronaut output and the annotation data.
```{r load_data_spec.R}
-# load the spectronaut outputfile
base_spectronaut_data <- read_protti(filename = spectronaut_xls_file, sep="\t")
```
```{r load_data_annot.R}
-# load the MS-annotation file...alternatively reach back to original submission file (e.g. "sb-007_INITIALsubmission.xlsx")
#The annotation data contains only information about the samples and conditions of interest.
annot <- read.csv(ms_annotation_file, header = TRUE, sep=";")
```
@@ -144,9 +133,8 @@ annot <- read.csv(ms_annotation_file, header = TRUE, sep=";")
mutantmatrix_data <- read.csv(mutant_matrix, sep=",", header=T)
```
+The function 'latest_file' will load the latest version of a file with a defined name and the date it was generated.
```{r load_data_uniprpt.R}
-
-# this part is a bit messy, maybe define function, which loads latest version of any script
# load Uniprot annotation (excluding GO terms) | see script fetch_uniprot_separate_script.r
load(latest_file(MAIN_PATH,"uniprot"))
```
@@ -158,7 +146,6 @@ mycobrowser_data <- read.csv("/scicore/home/gagneux/bousel00/tb/TbX015/annotatio
```{r load_data_kegg.R}
# load KEGG annotation
-#automatically load the latest version of the annotation file
load(latest_file(MAIN_PATH,"kegg")) # loads the data set: kegg_list
```
@@ -168,13 +155,14 @@ load(latest_file(MAIN_PATH,"kegg")) # loads the data set: kegg_list
#load(paste(MAIN_PATH, "annotation_files", "GSEA_kegg_list.Rdata", sep="/")) # loads the data set: GSEA_kegg_list
```
+TO DO: think about what to do with the QuickGO data...and where the uniprot GO data stems from.
```{r load_data_qg.R}
# load QuickGO annotation (all three domains)
quick_go_data <- read.csv("/scicore/home/gagneux/bousel00/tb/TbX015/annotation_files/QuickGO_annotations_MTBC_1773_ReviewedAndUnreviewed_20220722.tsv", header=T, sep="\t")
```
## Generate .gmt files
-Here, from the uniprot file, you generate the .gmt files, which are needed in the fgsea analysis. HOWEVER, as I'm using the clusterProfiler::GSEA and not the fgsea::fgsea function, I don't need any .gmt files.
+From the uniprot file, you generate the .gmt files, which are needed in the fgsea analysis. HOWEVER, as I'm using the clusterProfiler::GSEA and not the fgsea::fgsea function, I don't need any .gmt files.
```{r generate_gmt_files.R, eval=FALSE}
# load Uniprot-gmt files. No New Version-Selection here. These files, I generated from the uniprot data set. Together they are 23Mb, maybe I can just re-generate them every pipeline run
go_cc_gmt = make_gmt(uniprot, "go_cellular_component") # uses my defined functions
@@ -227,13 +215,16 @@ spectronaut_data <- base_spectronaut_data_reduced %>%
rm(base_spectronaut_data_reduced, annot)
```
-
-
-## Quality Control
-Here I conduct quality control of the data. First the QC.R is run on peptide level, then on protein level. Some plots are shown twice, because they first show the un-normalized data, then the normalized.
-```{r QC.R}
+## Quality Control- Peptide Level
+Here I conduct quality control of the data. First on peptide level. Some plots are shown twice, because they first show the un-normalized data, then the normalized.
+```{r pepQC.R}
strwrap("First QualityControl is run on peptide level.",prefix = "\n", initial = "")
run_QC(spectronaut_data, "peptide")
+```
+
+## Quality Control- Protein Level
+Here I conduct quality control of the data. Now on protein level. Some plots are shown twice, because they first show the un-normalized data, then the normalized.
+```{r protQC.R}
strwrap("Second QualityControl is run on protein level.",prefix = "\n", initial = "")
run_QC(spectronaut_data, "protein")
```
@@ -246,7 +237,7 @@ data_filtered_proteotypic <- spectronaut_data %>%
```
## Calculate the sequence coverage
-HErefore first annotate the data with the mutant matrix, which contains information needed for peptide type and sequence coverage.
+Here-fore, first annotate the data with the mutant matrix, which contains information needed for peptide type and sequence coverage.
```{r calculate_sequence_coverage.R}
data_filtered_uniprot <- data_filtered_proteotypic %>%
left_join(y = uniprot,
@@ -281,12 +272,11 @@ diff_abundance_data = calculate_missingness_and_DE(data_filtered_uniprot, 0.7, 0
diff_abundance_significant <- diff_abundance_data %>%
mutate(is_significant = ifelse((adj_pval < 0.01), TRUE, FALSE))
-
```
## Subset the data for GSEA
-Here the diff_abundance_data is subsetted for only columns necessary for GSEA.
+Here the diff_abundance_data is subseted for only columns necessary for GSEA.
```{r subset_for_GSEA.R}
diff_abundance_data_DEsubset <- unique(diff_abundance_data[c("pg_protein_groups","comparison","diff","t_statistic","adj_pval")])
```
@@ -302,7 +292,7 @@ pval_distribution_plot(data = diff_abundance_data,
```
## Volcano Plot
-Here a volcano plot of the comparison is drawn. The function automatically creates a plot
+Here a volcano plot of the comparison is drawn. The function automatically creates a plot.
```{r volcanoplot.R}
volcanoPlot_file_name = rev(unlist(strsplit(ms_annotation_file, "/")))[1] %>%
gsub("annotation_file_","",.) %>%
@@ -347,56 +337,51 @@ ranked_proteins = sort(ranked_proteins)
```
## Run GSEA
-One can choose to run GSEA on GO terms, KEGG pathways or Mycobrowser cathegories.
+One can choose to run GSEA on GO terms, KEGG pathways or Mycobrowser categories.
TO DO: check that none of these commands are dependent on the result of the protti_GO_analysis.R script.
There is also a protti function for the same task, however this would need more data transformation and reshuffeling (I did it but it's not very straight forward and therefore error prone).
Additionally, I want all GSEA to be calculated the same way.
First transform some data so GSEA of KEGG and Mycobrowser work
```{r GSEA.R}
-# transform data for kegg
-#GSEA_kegg_list = kegg_to_t2g(kegg_list)
-
-# transform data for mycobrowser
-#mycobrowser_FunCat <- unique(mycobrowser_data[,c("Functional_Category","UniProt_AC")])
-#mycobrowser_Function <- unique(mycobrowser_data[,c("Function","UniProt_AC")])
-#mycobrowser_Product <- unique(mycobrowser_data[,c("Product","UniProt_AC")])
-
-# run GSEA
-GO_CC_GSEA = run_GSEA(ranked_proteins, "GO_CC", 0.01,MAIN_PATH,kegg_list,mycobrowser_data)
-GO_BP_GSEA = run_GSEA(ranked_proteins, "GO_BP", 0.01,MAIN_PATH,kegg_list,mycobrowser_data)
-GO_MF_GSEA = run_GSEA(ranked_proteins, "GO_MF", 0.01,MAIN_PATH,kegg_list,mycobrowser_data)
+GO_CC_GSEA = run_GSEA(ranked_proteins, "GO_CC", 0.01, MAIN_PATH, kegg_list, mycobrowser_data)
+GO_BP_GSEA = run_GSEA(ranked_proteins, "GO_BP", 0.01, MAIN_PATH, kegg_list, mycobrowser_data)
+GO_MF_GSEA = run_GSEA(ranked_proteins, "GO_MF", 0.01, MAIN_PATH, kegg_list, mycobrowser_data)
-KEGG_GSEA = run_GSEA(ranked_proteins, "KEGG", 0.01,MAIN_PATH,kegg_list,mycobrowser_data)
+KEGG_GSEA = run_GSEA(ranked_proteins, "KEGG", 0.01, MAIN_PATH, kegg_list, mycobrowser_data)
-MYB_FC = run_GSEA(ranked_proteins, "MYCOBROWSER_FC", 0.01,MAIN_PATH,kegg_list,mycobrowser_data)
-MYB_FU = run_GSEA(ranked_proteins, "MYCOBROWSER_FU", 0.01,MAIN_PATH,kegg_list,mycobrowser_data)
-MYB_PR = run_GSEA(ranked_proteins, "MYCOBROWSER_PR", 0.01,MAIN_PATH,kegg_list,mycobrowser_data)
+MYB_FC = run_GSEA(ranked_proteins, "MYCOBROWSER_FC", 0.01, MAIN_PATH, kegg_list, mycobrowser_data)
+MYB_FU = run_GSEA(ranked_proteins, "MYCOBROWSER_FU", 0.01, MAIN_PATH, kegg_list, mycobrowser_data)
+MYB_PR = run_GSEA(ranked_proteins, "MYCOBROWSER_PR", 0.01, MAIN_PATH, kegg_list, mycobrowser_data)
```
## Summarize GSEA results
-```{r summarize_GSEA.R, include=TRUE}
-GSEA_overall_results = data.frame(matrix(ncol=12, nrow=0))
-GSEA_overall_results_colnames = c("Cathegory","ID","Description","setSize","enrichmentScore","NES","pvalue","p.adjust","qvalues","rank","leading_edge","core_enrichment")
+```{r summarize_GSEA_makedf.R, include=TRUE}
+GSEA_overall_results = data.frame(matrix(ncol=11, nrow=0))
+GSEA_overall_results_colnames = c("ID","Description","setSize","enrichmentScore","NES","pvalue","p.adjust","qvalues","rank","leading_edge","core_enrichment")#,"Category")
colnames(GSEA_overall_results) = GSEA_overall_results_colnames
-GSEA_summarize(GSEA_overall_results, GO_CC_GSEA, "GO_CC")
-GSEA_summarize(GSEA_overall_results, GO_BP_GSEA, "GO_BP")
-GSEA_summarize(GSEA_overall_results, GO_MF_GSEA, "GO_MF")
+GSEA_overall_results = GSEA_summarize(GSEA_overall_results, GO_CC_GSEA)#, "GO_CC")
+GSEA_overall_results = GSEA_summarize(GSEA_overall_results, GO_BP_GSEA)#, "GO_BP")
+GSEA_overall_results = GSEA_summarize(GSEA_overall_results, GO_MF_GSEA)#, "GO_MF")
+print(GSEA_overall_results)
for(name in names(KEGG_GSEA)){
- GSEA_summarize(GSEA_overall_results, KEGG_GSEA[[name]], paste("KEGG_",name,sep=""))}
+ GSEA_overall_results = GSEA_summarize(GSEA_overall_results, KEGG_GSEA[[name]])} #, paste("KEGG_",name,sep=""))}
-GSEA_summarize(GSEA_overall_results, MYB_FC, "MYB_FC")
-GSEA_summarize(GSEA_overall_results, MYB_FU, "MYB_FU")
-GSEA_summarize(GSEA_overall_results, MYB_PR, "MYB_PR")
+GSEA_overall_results = GSEA_summarize(GSEA_overall_results, MYB_FC)#, "MYB_FC")
+GSEA_overall_results = GSEA_summarize(GSEA_overall_results, MYB_FU)#, "MYB_FU")
+GSEA_overall_results = GSEA_summarize(GSEA_overall_results, MYB_PR)#, "MYB_PR")
+
+row.names(GSEA_overall_results) = NULL
+
+write.csv(GSEA_overall_results, paste(MAIN_PATH, "output_files/intermed_op3.csv", sep="/"), quote = F, col.names = T, row.names = F, sep = ",")
```
## Write down GSEA results
```{r write_GSNEA.R, include=TRUE}
-GSEA_overall_results_short = GSEA_overall_results[,c("ID","p.adjust","Cathegory")]
+GSEA_overall_results_short = GSEA_overall_results[,c("ID","p.adjust")]
-print(GSEA_overall_results_short)
# Display the data frame using kable
knitr::kable(GSEA_overall_results_short)
@@ -406,17 +391,17 @@ GSEA_output_file_name = rev(unlist(strsplit(ms_annotation_file, "/")))[1] %>%
gsub(".csv","",.) %>%
paste(.,"_GSEA_all_results.csv", sep="")
-print("going to save the table now.")
-print(paste(GSEA_overall_results, paste(MAIN_PATH, "output_files",GSEA_output_file_name, sep="/")))
-print(getwd())
write.csv(GSEA_overall_results, paste(MAIN_PATH, "output_files",GSEA_output_file_name, sep="/"), quote = F, col.names = T, row.names = F, sep = ",")
-write.csv(GSEA_overall_results, "test_output.csv")
print("table should be saved now.")
-
+print(ls())
Sys.sleep(5)
```
+## Show SessionInfo
+```{r writeSessionInfo.R, include=TRUE}
+sessionInfo()
+```
# End of Pipeline Analysis. But what to do with Quick Go data? Or is pipeline using it?
diff --git a/inst/rmarkdown/Mtb_Proteomics_Pipeline_worksheet_troubleshoot.Rmd b/inst/rmarkdown/Mtb_Proteomics_Pipeline_worksheet_troubleshoot.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..b65ad13965329a5df68f8b67d72e3b6cb7aa94b7
--- /dev/null
+++ b/inst/rmarkdown/Mtb_Proteomics_Pipeline_worksheet_troubleshoot.Rmd
@@ -0,0 +1,454 @@
+---
+title: "Mtb_Proteome_Analysis_Pipeline"
+author: "Selim Bouaouina"
+date: "2023-08-16"
+output:
+ html_document: default
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE,
+ warning = TRUE,
+ message = TRUE)
+```
+
+# Proteome analysis pipeline
+
+### Main goal
+Different input files can be specified in the R-code with the "argparse" package. In the end I want to be able to run the pipeline from a slurm script like following:
+```{bash main_goal, eval=FALSE}
+R pipeline_script.R -f input_sne_file.xls -a input_file_containing_all_filepaths --git path/to/respective/git/repo -o specify_outputfolder --c datafile_with_desired_comparisons
+# or
+sbatch proteome_analysis_pipeline_script.sh -i input_file -a annotation_file -c comparison_file -d(detection_cutoff) 0.75 -o output_file_name_prefix
+```
+
+### Split the work in four parts
+* Load, filter and merge data
+* QC peptide level
+* QC protein level
+* Differential expression analysis
+ - Comparison level(s), defined by the composition of the annotation file
+* Enrichment Analysis
+ - Annotations
+ - Gene Ontology
+ - MF
+ - CC
+ - BP
+ - KEGG Pathways
+ - Mycobrowser
+ - FC
+ - Function
+ - Product
+
+### Thoughts and Comments
+From <https://raps-with-r.dev/project_rewrite.html> I learn that I can shorten my script a lot and make it more easy readable and executable.
+There the author also suggests to work with the {target} R package, which is also introduced here <https://books.ropensci.org/targets/walkthrough.html>.
+
+More concrete:
+* work with git, to learn it, and because it will be safer in case of messing up
+* make separate R-scripts:
+ - where you define all functions. source this script in the code
+ - where you load, filter and merge the data
+ - QC peptide-level (can maybe write a function for QC and then decide the level in function-input)
+ - QC protein-level (can maybe write a function for QC and then decide the level in function-input)
+ - Enrichment Analysis
+* modify for loops into apply functions, to make the script more robust (examples on <https://raps-with-r.dev/project_rewrite.html>)
+* for the script to run faster and to use less memory, make sure you unload all data-sets as soon as they are not needed in the work environment anymore (can check all loaded objects with {ls()})
+* make the script write it's own report with comments, plots and tables.
+* try to make it work with target, so independent jobs can run in parallel and you can run the script on multiple cores in parallel (e.g. the comparison-levels)
+* provide a folder in the git-folder, where you have all annotation files
+* providing the extraction-OD data, I can also further test for correlation between CV and variability between extraction-ODs of the strain, and add this to the QC
+* write in report what package versions were used and what annotation data files (as they contain the date)
+* maybe it helps when reading in the mutant matrix, to make a list of it, for easier access of elements?
+
+# load all packages
+```{r loading_the_packages}
+library("protti")
+library("magrittr")
+library("dplyr") # from tidyverse
+library("tidyr") # from tidyverse
+library("ggplot2") # from tidyverse
+library("purrr") # from tidyverse
+library("gplots")
+library("ggpubr")
+library("VennDiagram")
+library("ggvenn")
+library("KEGGREST")
+library("Peptides")
+library("stringr")
+library("fgsea")
+library("ggnewscale")
+library("clusterProfiler")
+library("pathview")
+library("enrichplot")
+library("biomaRt")
+library("ggrepel")
+library("msigdbr")
+library("ggridges")
+library("lattice") #to print pdfs from loops
+```
+
+## Set the file paths
+Here you set all important paths. Not sure if I am going to need this script in the final more developed stage. Maybe provide all file names and paths in a separate input file
+The variables are put in on the comand line, the params argument from the rmarkdown::render() function passes the values into the Rscript and they can here be accessed via params$variableINeed.
+```{r loading_the_paths}
+
+#MAIN_PATH = "/scicore/home/gagneux/bousel00/tb/proteomics/MTB_proteome_pipeline"
+#MAIN_PATH = "/scicore/home/gagneux/bousel00/tb/TbX015"
+MAIN_PATH = params$MAIN_PATH
+
+#EXPERIMENT = "PROEVO001"
+#EXPERIMENT = "TbX015"
+EXPERIMENT = params$EXPERIMENT
+
+#spectronaut_xls_file = "/scicore/home/gagneux/bousel00/tb/proteomics/PROEVO001/spectronaut_report/20220505_155013_P462_SB_Quant1SW_20210510_Report_prottiadapted_corrected.xls"
+#spectronaut_xls_file = "/scicore/home/gagneux/bousel00/tb/TbX015/spectronaut_report_files/20230804_153908_P462_SB_TbX015_directDIA_Report.tsv"
+spectronaut_xls_file = params$spectronaut_xls_file
+print(spectronaut_xls_file)
+
+#ms_annotation_file = "/scicore/home/gagneux/bousel00/tb/proteomics/PROEVO001/files_sent_by_tom/MSstats_analysis_containingOldreport/P462_SB_Quant1SW_20210510_ConditionSetup.xls"
+#ms_annotation_file = "/scicore/home/gagneux/bousel00/tb/TbX015/annotation_file_wt_vs_s450l.csv"
+ms_annotation_file = paste(MAIN_PATH, params$ms_annotation_file,sep="/")
+
+#mutant_matrix = "/scicore/home/gagneux/bousel00/tb/proteomics/MTB_proteome_pipeline/annotation_files/PROTEOME_PIPELINE_MUTANTMATRIX.txt"
+mutant_matrix = system.file("annotation_files", "PROTEOME_PIPELINE_MUTANTMATRIX.txt", package="tbeeprotpip")
+ if (mutant_matrix == "") {
+ stop(paste("Could not find `PROTEOME_PIPELINE_MUTANTMATRIX.txt`. Try re-installing `tbeeprotpip`."), call. = FALSE)
+ }
+```
+
+## write up mock data
+Write up a mock data set to export
+```{r write_mock.R}
+first_name = c("Marco","Niggi","Mark","David","Jan","Lorenz","NicolasK","Nic","Simon","Philippe")
+last_name = c("Meyer","Ruettimann","Montalbo","Straumann","Wenger","Kaufmann","Kull","Mueller","Choffat","Handschin")
+year = c("28","29","29","29","29","30","30","31","29","29")
+
+GSEA_overall_results_short = data.frame(cbind(first_name, last_name, year))
+
+
+print(GSEA_overall_results_short)
+# Display the data frame using kable
+knitr::kable(GSEA_overall_results_short)
+
+
+GSEA_output_file_name = rev(unlist(strsplit(ms_annotation_file, "/")))[1] %>%
+ gsub("annotation_file_","",.) %>%
+ gsub(".csv","",.) %>%
+ paste(.,"_GSEA_all_results_klassentest.csv", sep="")
+
+print("going to save the table now.")
+print(paste(GSEA_overall_results_short, paste(MAIN_PATH, "output_files",GSEA_output_file_name, sep="/")))
+print(getwd())
+write.csv(GSEA_overall_results_short, paste(MAIN_PATH, "output_files",GSEA_output_file_name, sep="/"), quote = F, col.names = T, row.names = F, sep = ",")
+write.csv(GSEA_overall_results_short, "klassentest_output.csv")
+print("table should be saved now.")
+
+```
+
+
+
+
+## Load the data
+Here you load (and merge?) all data sets.
+```{r load_data_spec.R, eval=FALSE}
+# load the spectronaut outputfile
+base_spectronaut_data <- read_protti(filename = spectronaut_xls_file, sep="\t")
+```
+
+```{r load_data_annot.R, eval=FALSE}
+# load the MS-annotation file...alternatively reach back to original submission file (e.g. "sb-007_INITIALsubmission.xlsx")
+#The annotation data contains only information about the samples and conditions of interest.
+annot <- read.csv(ms_annotation_file, header = TRUE, sep=";")
+```
+
+```{r load_data_mutantmat.R, eval=FALSE}
+
+# load the extraction-OD data
+
+# load the Trp-protein quantification data
+
+# load the mutant matrix information
+mutantmatrix_data <- read.csv(mutant_matrix, sep=",", header=T)
+```
+
+```{r load_data_uniprpt.R, eval=FALSE}
+
+# this part is a bit messy, maybe define function, which loads latest version of any script
+# load Uniprot annotation (excluding GO terms) | see script fetch_uniprot_separate_script.r
+load(latest_file(MAIN_PATH,"uniprot"))
+```
+
+```{r load_data_mycobrow.R, eval=FALSE}
+# load Mycobrowser annotation | downloaded from mycobrowser website on 24. May 2022
+mycobrowser_data <- read.csv("/scicore/home/gagneux/bousel00/tb/TbX015/annotation_files/mycobrowser_Mycobacterium_tuberculosis_H37Rv_txt_v4.txt", header=T, sep="\t")
+```
+
+```{r load_data_kegg.R, eval=FALSE}
+# load KEGG annotation
+#automatically load the latest version of the annotation file
+load(latest_file(MAIN_PATH,"kegg")) # loads the data set: kegg_list
+
+```
+
+```{r load_gseakeggdata.R, eval=FALSE, eval=FALSE}
+# load KEGG file. No New Version-Selection here
+#load(paste(MAIN_PATH, "annotation_files", "GSEA_kegg_list.Rdata", sep="/")) # loads the data set: GSEA_kegg_list
+```
+
+```{r load_data_qg.R, eval=FALSE}
+# load QuickGO annotation (all three domains)
+quick_go_data <- read.csv("/scicore/home/gagneux/bousel00/tb/TbX015/annotation_files/QuickGO_annotations_MTBC_1773_ReviewedAndUnreviewed_20220722.tsv", header=T, sep="\t")
+```
+
+## Generate .gmt files
+Here, from the uniprot file, you generate the .gmt files, which are needed in the fgsea analysis. HOWEVER, as I'm using the clusterProfiler::GSEA and not the fgsea::fgsea function, I don't need any .gmt files.
+```{r generate_gmt_files.R, eval=FALSE}
+# load Uniprot-gmt files. No New Version-Selection here. These files, I generated from the uniprot data set. Together they are 23Mb, maybe I can just re-generate them every pipeline run
+go_cc_gmt = make_gmt(uniprot, "go_cellular_component") # uses my defined functions
+go_bp_gmt = make_gmt(uniprot, "go_biological_process") # uses my defined functions
+go_mf_gmt = make_gmt(uniprot, "go_molecular_function") # uses my defined functions
+```
+
+## Load data for GSEA
+TO DO: These data sets remain to be generated from the gmt file...WHY DID I WRITE THIS DOWN? CHECK!
+```{r load_data_for_GSEA, eval=FALSE}
+load(paste(MAIN_PATH, "annotation_files", "cc_table_for_gmt.Rdata", sep="/")) # loads the data set: cc_table_for_gmt
+load(paste(MAIN_PATH, "annotation_files", "bp_table_for_gmt.Rdata", sep="/")) # loads the data set: bp_table_for_gmt
+load(paste(MAIN_PATH, "annotation_files", "mf_table_for_gmt.Rdata", sep="/")) # loads the data set: mf_table_for_gmt
+```
+
+## Filtering the data for Quality Control (QC)
+Here I remove the "r_replicate" column, because it will be added later on with the proper replicate numbers.
+Then I remove the base_spectronaut_data from the environment to save RAM.
+Then I remove all decoy peptides and reshuffle the annot file.
+```{r filtering_for_QC.R, eval=FALSE}
+base_spectronaut_data_reduced <- subset(base_spectronaut_data, select=-c(r_replicate))
+
+rm(base_spectronaut_data)
+
+base_spectronaut_data_reduced <- base_spectronaut_data_reduced %>%
+ filter(eg_is_decoy == FALSE)
+
+# choose the relevant columns to be added to the spectronaut data
+names(annot) = c("File.Name","SampleDescription","COI","n_number")
+# rename the "File.Name" column, so it's easily merged with the spectronaut data
+colnames(annot)[colnames(annot) == "File.Name"] = "r_file_name"
+
+# re-order the data, so correct replicate number can be added
+annot <- annot[with(annot, order(COI, r_file_name)),]
+annot$r_replicate <- c(1:dim(annot)[1])
+```
+
+## Annotate Spectronaut-Data
+Here I annotate the data. As each line in "base_spectronaut_data_reduced" matches at most one line in y, choose relationship = "many-to-one". Then filter the data, so only the rows of interest are kept, menaing only the ones which got an entry from the merging with the annotation data. Then remove the two old data sets, to not overload the environment memory
+```{r annotating_with_MS_annotation.R, eval=FALSE}
+spectronaut_data <- base_spectronaut_data_reduced %>%
+ full_join(annot, by="r_file_name", relationship = "many-to-one") %>%
+ # filter rows where "Condition" is empty
+ filter(!(is.na(COI))) %>%
+ # reorder columns
+ relocate("r_file_name") %>%
+ relocate("r_condition", .after="r_file_name") %>%
+ relocate("r_replicate", .after="r_condition")
+
+rm(base_spectronaut_data_reduced, annot)
+```
+
+
+
+## Quality Control
+Here I conduct quality control of the data. First the QC.R is run on peptide level, then on protein level. Some plots are shown twice, because they first show the un-normalized data, then the normalized.
+```{r QC.R, eval=FALSE}
+strwrap("First QualityControl is run on peptide level.",prefix = "\n", initial = "")
+run_QC(spectronaut_data, "peptide")
+strwrap("Second QualityControl is run on protein level.",prefix = "\n", initial = "")
+run_QC(spectronaut_data, "protein")
+```
+
+## Filter the data for Expression Analysis
+Remove the non-proteotypic peptides.
+```{r filtering_for_EA.R, eval=FALSE}
+data_filtered_proteotypic <- spectronaut_data %>%
+ filter(pep_is_proteotypic == TRUE)
+```
+
+## Calculate the sequence coverage
+HErefore first annotate the data with the mutant matrix, which contains information needed for peptide type and sequence coverage.
+```{r calculate_sequence_coverage.R, eval=FALSE}
+data_filtered_uniprot <- data_filtered_proteotypic %>%
+ left_join(y = uniprot,
+ by = "pg_protein_accessions") %>%
+ left_join(y = mutantmatrix_data,
+ by = "n_number") %>%
+ find_peptide(protein_sequence = sequence,
+ peptide_sequence = pep_stripped_sequence) %>%
+ assign_peptide_type(aa_before = aa_before,
+ last_aa = last_aa,
+ aa_after = aa_after) %>%
+ calculate_sequence_coverage(protein_sequence = sequence,
+ peptides = pep_stripped_sequence)
+
+qc_sequence_coverage(
+ data = data_filtered_uniprot,
+ protein_identifier = pg_protein_accessions,
+ coverage = coverage
+)
+
+```
+
+## Calculate missingness and differencial abundance of proteins
+These functions from protti calculate the missingness based on adjustable thresholds and calculate the differential abundance of proteins. Furthermore a new column is added containing logicals if protein is significantly DE or not.
+To do so, the protein intensities need to be normalized first. This is already done in the QC, but not transferred from there to the global environment.
+```{r calculate_missingness_and_DE.R, eval=FALSE}
+data_filtered_uniprot$transformed_quantity_log2 = log2(data_filtered_uniprot$pg_quantity)
+data_filtered_uniprot <- normalise(data_filtered_uniprot, sample = r_file_name, intensity_log2 = transformed_quantity_log2, method = "median") # this adds the column "normalised_intensity_log2"
+data_filtered_uniprot$norm_quantity <- 2^(data_filtered_uniprot$normalised_intensity_log2)
+
+diff_abundance_data = calculate_missingness_and_DE(data_filtered_uniprot, 0.7, 0.25)
+
+diff_abundance_significant <- diff_abundance_data %>%
+ mutate(is_significant = ifelse((adj_pval < 0.01), TRUE, FALSE))
+
+```
+
+
+## Subset the data for GSEA
+Here the diff_abundance_data is subsetted for only columns necessary for GSEA.
+```{r subset_for_GSEA.R, eval=FALSE}
+diff_abundance_data_DEsubset <- unique(diff_abundance_data[c("pg_protein_groups","comparison","diff","t_statistic","adj_pval")])
+```
+
+## Calculate and plot the p-value distribution
+This can already tell something about the data.
+The non-adjusted p-values are plotted, to look for any tendencies my data might has, e.g. left peak shows that often my nullhypothesis can be rejected. The tail on the right shows level of FDR to set for minimizing false positives. The function automatically creates a plot.
+```{r pval_distribution.R, eval=FALSE}
+pval_distribution_plot(data = diff_abundance_data,
+ grouping = pg_protein_accessions, # correct?
+ pval = pval
+ )
+```
+
+## Volcano Plot
+Here a volcano plot of the comparison is drawn. The function automatically creates a plot
+```{r volcanoplot.R, eval=FALSE}
+volcanoPlot_file_name = rev(unlist(strsplit(ms_annotation_file, "/")))[1] %>%
+ gsub("annotation_file_","",.) %>%
+ gsub(".csv","",.) %>%
+ paste(.,"_volcanoPlot.pdf", sep="")
+
+graphics.off()
+pdf(paste(MAIN_PATH, "output_files", volcanoPlot_file_name, sep="/"))
+print(
+volcano_plot(
+ data = diff_abundance_data,
+ grouping = pg_protein_accessions,
+ log2FC = diff,
+ significance = pval,
+ method = "significant",
+ #target_column = pg_protein_accessions,
+ #target = "P62942",
+ #facet_scales = "fixed",
+ x_axis_label = "log2(fold change)",
+ facet_by = comparison, # all comparisons in one plot
+ significance_cutoff = c(0.05, "adj_pval")
+)
+)
+dev.off()
+```
+
+## protti-based GO analysis
+I have made this this a function, where you can choose if it should calculate CC, BP, MF and if you want to have a table or a plot returned.
+```{r protti_GO_analysis.R, eval=FALSE}
+go_cc_protti_analysis = protti_GO_analysis(diff_abundance_significant, "CC", T)
+go_cc_protti_analysis = protti_GO_analysis(diff_abundance_significant, "BP", T)
+go_cc_protti_analysis = protti_GO_analysis(diff_abundance_significant, "MF", T)
+
+```
+
+## Rank the proteins for GSEA
+PREREQUISITES: rank proteins per condition based on their t-statistics.
+```{r rank_proteins.R, eval=FALSE}
+ranked_proteins = as.numeric(diff_abundance_data_DEsubset$t_statistic)
+names(ranked_proteins) = diff_abundance_data_DEsubset$pg_protein_groups
+ranked_proteins = sort(ranked_proteins)
+```
+
+## Run GSEA
+One can choose to run GSEA on GO terms, KEGG pathways or Mycobrowser cathegories.
+TO DO: check that none of these commands are dependent on the result of the protti_GO_analysis.R script.
+There is also a protti function for the same task, however this would need more data transformation and reshuffeling (I did it but it's not very straight forward and therefore error prone).
+Additionally, I want all GSEA to be calculated the same way.
+
+First transform some data so GSEA of KEGG and Mycobrowser work
+```{r GSEA.R, eval=FALSE}
+# transform data for kegg
+#GSEA_kegg_list = kegg_to_t2g(kegg_list)
+
+# transform data for mycobrowser
+#mycobrowser_FunCat <- unique(mycobrowser_data[,c("Functional_Category","UniProt_AC")])
+#mycobrowser_Function <- unique(mycobrowser_data[,c("Function","UniProt_AC")])
+#mycobrowser_Product <- unique(mycobrowser_data[,c("Product","UniProt_AC")])
+
+# run GSEA
+GO_CC_GSEA = run_GSEA(ranked_proteins, "GO_CC", 0.01,MAIN_PATH,kegg_list,mycobrowser_data)
+GO_BP_GSEA = run_GSEA(ranked_proteins, "GO_BP", 0.01,MAIN_PATH,kegg_list,mycobrowser_data)
+GO_MF_GSEA = run_GSEA(ranked_proteins, "GO_MF", 0.01,MAIN_PATH,kegg_list,mycobrowser_data)
+
+KEGG_GSEA = run_GSEA(ranked_proteins, "KEGG", 0.01,MAIN_PATH,kegg_list,mycobrowser_data)
+
+MYB_FC = run_GSEA(ranked_proteins, "MYCOBROWSER_FC", 0.01,MAIN_PATH,kegg_list,mycobrowser_data)
+MYB_FU = run_GSEA(ranked_proteins, "MYCOBROWSER_FU", 0.01,MAIN_PATH,kegg_list,mycobrowser_data)
+MYB_PR = run_GSEA(ranked_proteins, "MYCOBROWSER_PR", 0.01,MAIN_PATH,kegg_list,mycobrowser_data)
+```
+
+## Summarize GSEA results
+```{r summarize_GSEA.R, include=TRUE, eval=FALSE}
+GSEA_overall_results = data.frame(matrix(ncol=12, nrow=0))
+GSEA_overall_results_colnames = c("Cathegory","ID","Description","setSize","enrichmentScore","NES","pvalue","p.adjust","qvalues","rank","leading_edge","core_enrichment")
+colnames(GSEA_overall_results) = GSEA_overall_results_colnames
+
+GSEA_summarize(GSEA_overall_results, GO_CC_GSEA, "GO_CC")
+GSEA_summarize(GSEA_overall_results, GO_BP_GSEA, "GO_BP")
+GSEA_summarize(GSEA_overall_results, GO_MF_GSEA, "GO_MF")
+
+for(name in names(KEGG_GSEA)){
+ GSEA_summarize(GSEA_overall_results, KEGG_GSEA[[name]], paste("KEGG_",name,sep=""))}
+
+GSEA_summarize(GSEA_overall_results, MYB_FC, "MYB_FC")
+GSEA_summarize(GSEA_overall_results, MYB_FU, "MYB_FU")
+GSEA_summarize(GSEA_overall_results, MYB_PR, "MYB_PR")
+```
+
+## Write down GSEA results
+```{r write_GSNEA.R, include=TRUE, eval=FALSE}
+GSEA_overall_results_short = GSEA_overall_results[,c("ID","p.adjust","Cathegory")]
+
+print(GSEA_overall_results_short)
+# Display the data frame using kable
+knitr::kable(GSEA_overall_results_short)
+
+
+GSEA_output_file_name = rev(unlist(strsplit(ms_annotation_file, "/")))[1] %>%
+ gsub("annotation_file_","",.) %>%
+ gsub(".csv","",.) %>%
+ paste(.,"_GSEA_all_results.csv", sep="")
+
+print("going to save the table now.")
+print(paste(GSEA_overall_results, paste(MAIN_PATH, "output_files",GSEA_output_file_name, sep="/")))
+print(getwd())
+write.csv(GSEA_overall_results, paste(MAIN_PATH, "output_files",GSEA_output_file_name, sep="/"), quote = F, col.names = T, row.names = F, sep = ",")
+write.csv(GSEA_overall_results, "test_output.csv")
+print("table should be saved now.")
+print(ls())
+Sys.sleep(5)
+```
+
+
+
+# End of Pipeline Analysis. But what to do with Quick Go data? Or is pipeline using it?
+
+
+
+
+
diff --git a/inst/rmarkdown/test_output.csv b/inst/rmarkdown/test_output.csv
new file mode 100644
index 0000000000000000000000000000000000000000..e406e0545eed4de39422b4cd5a9a2cdc37da80c1
--- /dev/null
+++ b/inst/rmarkdown/test_output.csv
@@ -0,0 +1 @@
+"","ID","Description","setSize","enrichmentScore","NES","pvalue","p.adjust","qvalues","rank","leading_edge","core_enrichment","Category"
diff --git a/inst/slurm_script/tbeeprotpip_slurm_script.sh b/inst/slurm_script/tbeeprotpip_slurm_script.sh
index 8dedbe0ca54e0f8390b2be9f5a62208f5af15b73..13fb2ea6182db54c1d3f0f851a3478545b53bfc7 100644
--- a/inst/slurm_script/tbeeprotpip_slurm_script.sh
+++ b/inst/slurm_script/tbeeprotpip_slurm_script.sh
@@ -36,4 +36,5 @@ ANNOTATION_FILE=$4
#Rscript tbeeprotpip_runfile.R -m /scicore/home/gagneux/bousel00/tb/TbX015 -f Mtb_Proteomics_Pipeline_worksheet.Rmd -o WT_vs_S450L_output -i /scicore/home/gagneux/bousel00/tb/TbX015/spectronaut_report_files/20230804_153908_P462_SB_TbX015_directDIA_Report.tsv -x TbX015 -a annotation_file_wt_vs_s450l.csv
Rscript tbeeprotpip_runfile.R -m $MAIN_PATH -f Mtb_Proteomics_Pipeline_worksheet.Rmd -o $COMPARISON_NAME -i /scicore/home/gagneux/bousel00/tb/TbX015/spectronaut_report_files/20230804_153908_P462_SB_TbX015_directDIA_Report.tsv -x $EXPERIMENT_NAME -a $ANNOTATION_FILE
+#/scicore/soft/apps/R/4.2.2-foss-2021a/bin/Rscript tbeeprotpip_runfile.R -m $MAIN_PATH -f Mtb_Proteomics_Pipeline_worksheet.Rmd -o $COMPARISON_NAME -i /scicore/home/gagneux/bousel00/tb/TbX015/spectronaut_report_files/20230804_153908_P462_SB_TbX015_directDIA_Report.tsv -x $EXPERIMENT_NAME -a $ANNOTATION_FILE
diff --git a/man/GSEA_summarize.Rd b/man/GSEA_summarize.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..2fff86f2b0cc1e195117368f2d4e90f519dac784
--- /dev/null
+++ b/man/GSEA_summarize.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/GSEA_summarize.R
+\name{GSEA_summarize}
+\alias{GSEA_summarize}
+\title{Summarize GSEA results}
+\usage{
+GSEA_summarize(data_set_to_extend, data_set_to_attach)
+}
+\arguments{
+\item{data_set_to_extend}{need specification}
+
+\item{data_set_to_attach}{need specification}
+}
+\value{
+This function summarizes all results from all GSEA.
+}
+\description{
+This function summarizes all results from all GSEA and writes the results to the global environment (see <<-)
+}
diff --git a/man/kegg_to_t2g.Rd b/man/kegg_to_t2g.Rd
index 0fd8f0e6d9bb8392be0a869722d76a0c2472c74c..867ae1000228da1329b2bc077477613a6f04c095 100644
--- a/man/kegg_to_t2g.Rd
+++ b/man/kegg_to_t2g.Rd
@@ -1,17 +1,23 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/run_GSEA.R
+% Please edit documentation in R/run_GSEA.R, R/run_GSEA_allKEGGids.R
\name{kegg_to_t2g}
\alias{kegg_to_t2g}
\title{make TERMM2GENE list for KEGG pathways}
\usage{
+kegg_to_t2g(data_set)
+
kegg_to_t2g(data_set)
}
\arguments{
\item{data_set}{the kegg_list (need specification)}
}
\value{
+This function conducts GSEA using the clusterProfiler package.
+
This function conducts GSEA using the clusterProfiler package.
}
\description{
+This function converts the KEGG list into TERM2GENE format, which is required by the GSEA function. The kegg list contains tables for different kegg-organism IDs of M. tuberculosis. Therefore loop through each organism ID
+
This function converts the KEGG list into TERM2GENE format, which is required by the GSEA function. The kegg list contains tables for different kegg-organism IDs of M. tuberculosis. Therefore loop through each organism ID
}
diff --git a/man/run_GSEA.Rd b/man/run_GSEA.Rd
index 3058328ce5b4343c1ba9d7a3d59cdf97ce03c0a1..836421ab7266934df839460135f1eb20bc272321 100644
--- a/man/run_GSEA.Rd
+++ b/man/run_GSEA.Rd
@@ -4,7 +4,7 @@
\alias{run_GSEA}
\title{Conduct GSEA}
\usage{
-run_GSEA(data_set, kingdom, pvalCOff, list_for_kegg)
+run_GSEA(data_set, kingdom, pvalCOff, MAIN_PATH, kegg_list, mycobrowser_data)
}
\arguments{
\item{data_set}{A data set containing the ranged proteins... need specification}
@@ -12,6 +12,12 @@ run_GSEA(data_set, kingdom, pvalCOff, list_for_kegg)
\item{kingdom}{GO kingdom, must be "CC or "BP" or "MF".}
\item{pvalCOff}{set the cutoff for the p-value (e.g. 0.01)}
+
+\item{MAIN_PATH}{provide main path}
+
+\item{kegg_list}{provide kegg list}
+
+\item{mycobrowser_data}{provide mycobrowser_data}
}
\value{
This function conducts GSEA using the clusterProfiler package.
diff --git a/man/run_GSEA_allKEGG_ids.Rd b/man/run_GSEA_allKEGG_ids.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..1207ed33d82b49199d434510421987b74a4d192c
--- /dev/null
+++ b/man/run_GSEA_allKEGG_ids.Rd
@@ -0,0 +1,35 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/run_GSEA_allKEGGids.R
+\name{run_GSEA_allKEGG_ids}
+\alias{run_GSEA_allKEGG_ids}
+\title{Conduct GSEA}
+\usage{
+run_GSEA_allKEGG_ids(
+ data_set,
+ kingdom,
+ pvalCOff,
+ MAIN_PATH,
+ kegg_list,
+ mycobrowser_data
+)
+}
+\arguments{
+\item{data_set}{A data set containing the ranged proteins... need specification}
+
+\item{kingdom}{GO kingdom, must be "CC or "BP" or "MF".}
+
+\item{pvalCOff}{set the cutoff for the p-value (e.g. 0.01)}
+
+\item{MAIN_PATH}{provide main path}
+
+\item{kegg_list}{provide kegg list}
+
+\item{mycobrowser_data}{provide mycobrowser_data}
+}
+\value{
+This function conducts GSEA using the clusterProfiler package.
+}
+\description{
+This function conducts GSEA. You can choose if it should calculate CC, BP, MF. Make it a function, where one can choose, what entity the GSEA should be run on (GO_CC, GO_BP, GO_MF, KEGG etc.)
+You can check out the result table via "GSEA_results@result" (for further info check str(GSEA_results))
+}
diff --git a/man/write_pdf_report.Rd b/man/write_pdf_report.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..e1cf397dd39fc96de09554e960e930750fb67aa8
--- /dev/null
+++ b/man/write_pdf_report.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/write_pdf_report.R
+\name{write_pdf_report}
+\alias{write_pdf_report}
+\title{Generate a PDF report using Rmarkdown.}
+\usage{
+write_pdf_report(report_dir, rmd_fn, report_fn, rmd_params = list())
+}
+\arguments{
+\item{report_dir}{Path to the folder where the generated PDF Rmarkdown report is stored}
+
+\item{rmd_fn}{Filename of the Rmarkdown file}
+
+\item{report_fn}{Filename of the Rmarkdown rendered report}
+
+\item{rmd_params}{List of parameters}
+}
+\description{
+This function generates a standardised Rmarkdown report in a PDF format. It is adapted from https://github.com/SwissTPH/timci/
+Write e.g. write_pdf_report("/scicore/home/gagneux/bousel00/tb/TbX015",
+ "Mtb_Proteomics_Pipeline_worksheet.Rmd",
+ "S450L_vs_wt_output")
+}