Skip to content
Snippets Groups Projects
Commit eb40dfb9 authored by Flavio Lombardo's avatar Flavio Lombardo :goal:
Browse files

Testing new functions locally

parent 47b36959
No related branches found
No related tags found
No related merge requests found
Showing
with 1499 additions and 1110 deletions
*.html
---
name: Bug report or feature request
about: Describe a bug you've seen or make a case for a new feature
---
Please briefly describe your problem and what output you expect. If you have a question, please don't use this form. Instead, ask on <https://stackoverflow.com/> or <https://community.rstudio.com/>.
Please include a minimal reproducible example (AKA a reprex). If you've never heard of a [reprex](http://reprex.tidyverse.org/) before, start by reading <https://www.tidyverse.org/help/#reprex>.
For more advice on how to write a great issue, see <https://code-review.tidyverse.org/issues/>.
Brief description of the problem
```r
# insert reprex here
```
......@@ -2,3 +2,9 @@
.Rhistory
.RData
.Ruserdata
.Rprofile
*.txt
*.pdf
*.html
figures/**
temp/**
......@@ -3,7 +3,8 @@ Title: Get Data From QuPath For Data Analysis
Version: 0.1.0
BugReports: https://git.scicore.unibas.ch/ovca-research/DRUGSENS/issues
Authors@R: c(
person("Flavio", "Lombardo", "C.", "flavio.lombardo@unibas.ch", role = c("aut", "cre", "cph")),
person("Flavio", "Lombardo", , "flavio.lombardo@unibas.ch", role = c("aut", "cre", "cph")),
person("Ricardo Coelho", role = c("cph")),
person("Ovarian Cancer Research", role = c("cph")),
person("University of Basel and University Hospital Basel", role = c("cph"))
)
......@@ -15,12 +16,14 @@ License: MIT + file LICENSE
Imports:
dplyr,
tidyr,
readr,
stringr,
knitr,
testthat,
ggplot2,
ggpubr,
roxygen2
roxygen2,
tidyselect,
testthat
Depends:
R (>= 4.2)
VignetteBuilder:
......
......@@ -4,6 +4,7 @@ export(change_data_format_to_longer)
export(data_binding)
export(generate_qupath_script)
export(get_QC_plots)
export(get_QC_plots_parsed_merged_data)
export(make_count_dataframe)
export(make_run_config)
import(ggplot2)
......@@ -12,5 +13,8 @@ import(knitr)
import(testthat)
importFrom(dplyr,filter)
importFrom(dplyr,select)
importFrom(readr,write_excel_csv)
importFrom(stringr,str_count)
importFrom(stringr,str_extract)
importFrom(tidyr,pivot_longer)
importFrom(tidyselect,any_of)
......@@ -3,6 +3,7 @@
#' This function gets the count data data.frame, that has a wider format and it returns a longer-formatted data.frame
#' @importFrom tidyr pivot_longer
#' @importFrom dplyr select
#' @importFrom tidyselect any_of
#' @return A `dataframe`/`tibble`.
#' @param .data The markers count dataframe that is coming from the processing of the microscopy data
#' @param pattern_column_markers The markers' pattern name to obtain the column with ratios of the markers (it defaults to "_ratio_of_total_cells")
......@@ -21,30 +22,49 @@ change_data_format_to_longer <- function(.data,
# names of the columns
col_names_of_markers <- colnames(.data)[which(grepl(x = colnames(.data), pattern = pattern_column_markers))]
if (additional_columns){
additional_columns_to_use <- c("Treatment", "PID", "Image_number", "Tissue", "Concentration", "DOC", "Treatment_complete", "ReplicaOrNot")
if (additional_columns) {
additional_columns_to_use <- c(
"PID",
"Date",
"DOC",
"Tissue",
"Image_number",
"Treatment",
"Concentration1",
"Concentration2",
"ConcentrationUnits1",
"ConcentrationUnits2",
"ReplicaOrNot",
"Treatment_complete"
)
} else {
additional_columns_to_use <- NULL
}
if (length(col_names_of_markers) < 1) stop(paste0("Failed to find pattern: ", pattern_column_markers, " in the columnames"))
if (!all(additional_columns_to_use %in% colnames(.data))) stop(paste0('One or more of the following columnames:
c(Treatment", "PID", "Image_number", "Tissue", "Concentration", "DOC") could not be found.
Please check the names of your data.frame and/or provide your selection'),
"Those are the colnames found in the input data: ",
colnames(.data))
# if (!all(additional_columns_to_use %in% colnames(.data))) stop(paste0('One or more of the following columnames:
# c(Treatment", "PID", "Image_number", "Tissue", "Concentration", "DOC") could not be found.
# Please check the names of your data.frame and/or provide your selection'),
# "Those are the colnames found in the input data: ",
# colnames(.data))
if (!"Image_number" %in% additional_columns_to_use) stop("Image_number has to be in the dataframe.")
if (!"Treatment_complete" %in% additional_columns_to_use) stop("Treatment_complete has to be in the dataframe.")
longer_format <- .data |>
select(unique_name_row_identifier, col_names_of_markers, additional_columns_to_use) |>
pivot_longer(cols = c(col_names_of_markers),
select(any_of(c(
unique_name_row_identifier,
col_names_of_markers,
additional_columns_to_use
))) |>
pivot_longer(
cols = c(col_names_of_markers),
names_to = "marker_positivity",
values_to = "marker_positivity_ratio"
)
return(
longer_format
)
return(
longer_format
)
}
......@@ -23,52 +23,42 @@ process_file <- function(file_path,
message(paste0("Reading file: ", file_path))
# Read the CSV file into a data frame
data <- read.csv(file_path, stringsAsFactors = FALSE)
.data <- read.csv(file_path, stringsAsFactors = FALSE)
extension <- sub(x = extension, pattern = "\\.", "")
# add the image name
data$Image_number <- stringr::str_extract(
string = data$Image,
pattern = "series.\\d*"
)
# extract information from the data
data$PID <- str_extract(data$Image, "[A-Z0-9]+(?=_)")
data$Tissue <- sapply(strsplit(data$Image, "_"), `[`, 2, simplify=FALSE) |> unlist()
data$Date1 <- str_extract(data$Image, "\\d{4}.\\d{2}.\\d{2}")
data$DOC <- str_extract(data$Image, "(?<=DOC)\\d{4}\\.\\d{2}\\.\\d{2}")
data$ReplicaOrNot <- ifelse(stringr::str_detect(data$Image, pattern = "Replica|Rep|rep|replica|REPLICA|REP"), "Replica", NA_character_)
data$Treatment <- str_extract(string = data$Image, pattern = "(?<=\\d{4}\\.\\d{2}\\.\\d{2}_)[A-Za-z0-9]+(?=_.+)")
data$Concentration <- str_extract(data$Image, "\\d+(?=_[un][Mm])")
data$ConcentrationUnits <- str_extract(data$Image, "[un][Mm](?=_)")
# get the name, relabelling of the markers WIP
for(nam in names(list_of_relabeling)) {
data$Name <- gsub(
x = as.character(data$Name),
# get the name, relabeling of the markers WIP
for (nam in names(list_of_relabeling)) {
.data$Name <- gsub(
x = as.character(.data$Name),
pattern = nam,
replacement = list_of_relabeling[[nam]],
ignore.case = FALSE
)
}
## create unique_identifier
data$filter_image <- paste(
data$PID,
data$Date1,
data$DOC,
data$Tissue,
data$Image_number,
data$Treatment,
data$Concentration,
data$ConcentrationUnits,
data$ReplicaOrNot,
sep = "_"
)
# parse the data with the function
.data <- string_parsing(.data)
.data$filter_image <- apply(.data, 1, function(row) {
paste(
row["PID"],
row["Date1"],
row["DOC"],
row["Tissue"],
row["Image_number"],
row["Treatment"],
row["Concentration1"],
row["Concentration2"],
row["ConcentrationUnits1"],
row["ConcentrationUnits2"],
row["ReplicaOrNot"],
row["Treatment_complete"],
collapse = "_",
sep = "_"
)
})
return(data)
return(.data)
}
#' Merge all the dataframes coming out from the QuPath
......@@ -76,24 +66,21 @@ process_file <- function(file_path,
#' This function try to guess the string patterns that are in the dataset and then fill the dataframe
#' with that information. Finally the data is combined and combined them into one file
#' @import knitr
#' @import testthat
#' @importFrom stringr str_extract
#' @return A `dataframe`/`tibble`.
#' @param path_to_the_projects_folder The path where the files coming out of QuPath are located
#' @param files_extension_to_look_for The extension of the file outputted from QuPath
#' @param path_to_the_projects_folder String/Path The path where the files coming out of QuPath are located
#' @param files_extension_to_look_for String The extension of the file outputted from QuPath, (default is "csv")
#' @param recursive_search Boolean, it defined the behavior of the file search, if recursive or not, (default is FALSE)
#'
#' @export
#' @example
#' dataframe_output <- data_binding(path_to_the_projects_folder = "<USER_DEFINED_PATH>"
#' files_extension_to_look_for = "csv")
#'#This will return the dataframe of all the data in the folder
#' #This will return the dataframe of all the data in the folder
# Main function to bind data from multiple files
data_binding <- function(path_to_the_projects_folder,
files_extension_to_look_for,
recursive_search = FALSE
) {
files_extension_to_look_for = "csv",
recursive_search = FALSE) {
# run configuration file
make_run_config()
......@@ -111,21 +98,21 @@ data_binding <- function(path_to_the_projects_folder,
}
# List all files with the specified extension in the given folder
list_csv_files <- list_all_files(path_to_the_projects_folder,
files_extension_to_look_for,
recursive_search)
list_csv_files <- list_all_files(
path_to_the_projects_folder,
files_extension_to_look_for,
recursive_search
)
# Process each file and combine the results
df_list <- lapply(list_csv_files,
process_file,
# relabeling_map = use_custom_column_names,
files_extension_to_look_for)
df_list <- lapply(
list_csv_files,
process_file,
files_extension_to_look_for
)
combined_df <- do.call(rbind, df_list)
# # remove namings
# rm(list_csv_files, col_names_qupath_output_files)
# Return the combined dataframe
return(combined_df)
}
......@@ -14,8 +14,6 @@ generate_qupath_script <- function() {
x = paste0('
//This code script was tested with QuPath 4
//This code script was tested with QuPath 4
import qupath.lib.gui.tools.MeasurementExporter
import qupath.lib.objects.PathCellObject
import qupath.lib.objects.PathDetectionObject
......@@ -42,8 +40,8 @@ def exportType = PathCellObject.class
// Choose your *full* output path
def outputPath = "<USER_DEFINED_PATH>/<PID>_<TISSUE>_',Sys.Date(),'_<SAMPLE_DOC>_<TREATMENT_INITIALS>_<CONCENTRATION>_<CONCENTRATION_UNITS>_<REPLICA_OR_NOT>_<TUMOR_MARKER>_<APOPTOTIC_MARKER>.csv"
def outputFile = new File(outputPath)
// example <USER_DEFINED_PATH>/B39_Ascites_2023.11.10_DOC2023.10.05_NIRAPARIB_1000_nM_Rep_EpCAM_Ecad_cCasp3_ QuPath will add (series 1) at the end of this line
// example <USER_DEFINED_PATH>/B39_Ascites_2023.11.10_DOC2023.10.05_NIRAPARIB_1000_nM_Rep_EpCAM_Ecad_cCasp3_(series 01).tif
// example <USER_DEFINED_PATH>/B39_Ascites_2023.11.10_DOC2023.10.05_Niraparib_1000_nM_Rep_EpCAM_Ecad_cCasp3_ QuPath will add (series 1) at the end of this line
// example <USER_DEFINED_PATH>/B39_Ascites_2023.11.10_DOC2023.10.05_Niraparib_1000_nM_Rep_EpCAM_Ecad_cCasp3_(series 01).tif
// the part EpCAM_Ecad_cCasp3_ is optional but recommended
// Create the measurementExporter and start the export
......
#' Plot some QC plots for the bound data
#' @description
#' This plot can show trends within the dataset and run some basic statistics.
#'
#' @param .data The preprocessed data (after running make_count_dataframe() and change_data_format_to_longer()) merged data.frame that should be visualized
#' @param list_of_columns_to_plot The preprocessed data (after running make_count_dataframe() and change_data_format_to_longer()) merged data.frame that should be visualized
#' @param save_plots Boolean, TRUE if plots should be saved (default is FALSE)
#' @param saving_plots_folder String indicating the folder where the plots should be stored (default is "figures")
#' @param PID_column_name String, indicating the name of the sample to subset (default is "PID")
#' @param isolate_specific_drug String, indicating if there should be a Treatment specific data subset (default is NULL)
#' @param isolate_specific_patient String, indicating a spacific sample to plot only (default is NULL)
#' @param PID_column_name String, indicating the name of the sample to subset (default is "Treatment")
#' @param save_list_of_plots Boolean, if TRUE returns a named list of all the plots ran (default is TRUE), this can be usefult to isolate specific plots
#' @param save_plots_in_patient_specific_subfolders Boolean, if TRUE the plots will be saved (if `save_plots` TRUE) in sample specific folders (default is TRUE)
#' @param fill_color_variable Boolean, String, indicating the name of the variable (discrete) to use for the plot's filling
#' @param p_height Integer, indicate the plot's height (default is 10 inches)
#' @param p_width Integer, indicate the plot's width (default is 10 inches)
#' @param drug_column_name String, indicate the column indicating the Drug/Treament (default is "Treatment")
#'
#' @import ggplot2
#' @import ggpubr
#' @importFrom readr write_excel_csv
#' @importFrom dplyr filter
#' @return A `list`/`NULL`.
#' @example
#' ui <- get_QC_plots_parsed_merged_data(bind_data, save_plots = TRUE, save_list_of_plots = TRUE)
#' @export
get_QC_plots_parsed_merged_data <- function(.data,
list_of_columns_to_plot = NULL,
save_plots = FALSE,
saving_plots_folder = "figures",
save_plots_in_patient_specific_subfolders = TRUE,
fill_color_variable = NULL,
PID_column_name = "PID",
isolate_specific_drug = NULL,
isolate_specific_patient = NULL,
drug_column_name = "Treatment",
save_list_of_plots = TRUE,
p_height = 10,
p_width = 10) {
# List where plots could be stored
list_plottos <- list()
if (!is.data.frame(.data) | nrow(.data) < 1) stop("ERROR: the data provided must be not empty of dataframe type.")
# get the number of possible plotting variables
if (is.null(list_of_columns_to_plot)) {
list_of_columns_to_plot <- colnames(.data)[which(sapply(.data, is.numeric))]
}
# check that the fill_color_variable is in the dataset and not null
if (!is.null(fill_color_variable) & !fill_color_variable %in% colnames(.data)) stop("ERROR: the fill_color_variable must be in the colum names variables.")
# if the user decides to isolate a specific sample only
if (!is.null(isolate_specific_patient)) .data <- .data[.data[[PID_column_name]] == isolate_specific_patient, ]
for (pid in unique(.data[[PID_column_name]])) {
subset_data <- .data[.data[[PID_column_name]] == pid, ]
for (i in list_of_columns_to_plot) {
# if (!isolate_specific_drug %in% subset_data[[drug_column_name]] |> unique()) stop("ERROR: The `isolate_specific_drug` is not included in the name of the availble treaments.")
if (!is.null(isolate_specific_drug)) subset_data <- subset_data[subset_data[[drug_column_name]] %in% isolate_specific_drug, ]
if (nrow(subset_data) < 1) {
print(unique(subset_data[[PID_column_name]]))
print(unique(subset_data[[drug_column_name]]))
stop("ERROR: Your filtering query has returned no observations")
}
# browser()
# Function to dynamically add layers to a ggplot object based on conditions
add_violin_layers <- function(p, fill_color_variable) {
if (!is.null(fill_color_variable)) {
p <- p + geom_violin(trim = FALSE, aes_string(fill = fill_color_variable), color = NA) +
geom_boxplot(width = 0.1, fill = "white")
} else {
p <- p + geom_violin(trim = FALSE, fill = "#A4A4A4", color = "darkred") +
geom_boxplot(width = 0.1, fill = "white")
}
return(p)
}
# Initialize ggplot
p <- ggplot(subset_data, aes(x = Name, y = log2(unlist(subset_data[[i]]))))
# Add violin and boxplot layers
p <- add_violin_layers(p, fill_color_variable)
# More layers on top
p <- p + facet_wrap(~Treatment) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(
title = colnames(subset_data[i]),
x = "Cell Markers",
y = paste0("Intensity of ", colnames(subset_data[i]), " (log2)"),
subtitle = paste0(pid, ".", isolate_specific_drug)
) +
geom_jitter(shape = 16, position = position_jitter(0.01)) +
stat_summary(geom = "crossbar", fun = mean, colour = "red", width = 0.21)
# Conditionally add to list of plots
if (save_list_of_plots) {
list_plottos[[paste0(isolate_specific_drug, ".", pid, ".", i)]] <- p
}
if (save_plots) {
if (save_plots_in_patient_specific_subfolders) {
if (!dir.exists(paste0(saving_plots_folder, "/", pid))) dir.create(paste0(saving_plots_folder, "/", pid))
ggsave(
plot = p,
filename = paste0(
paste0(saving_plots_folder, "/", pid),
"/",
Sys.Date(),
"_",
pid,
".",
isolate_specific_drug,
".",
colnames(.data[i]),
".pdf"
),
device = "pdf",
dpi = 600
)
} else {
# Saving plots in .pdf at 600 dpi
if (!dir.exists(saving_plots_folder)) dir.create(saving_plots_folder)
ggsave(
plot = p,
width = p_width,
height = p_height,
filename = paste0(
saving_plots_folder,
"/",
Sys.Date(),
"_",
pid,
".",
isolate_specific_drug,
".",
colnames(.data[i]),
".pdf"
),
device = "pdf",
dpi = 600,
)
}
message(paste0(
"plots for: ",
pid,
".",
isolate_specific_drug,
".",
colnames(.data[i]), " saved"
))
}
}
}
}
......@@ -11,8 +11,7 @@
#' make_count_dataframe(data, name_of_the_markers_column = "Name", unique_name_row_identifier = "filter_image")
# adding the image number so to identify the distribution
make_count_dataframe <- function(.data, unique_name_row_identifier = "filter_image",
name_of_the_markers_column = "Name"
) {
name_of_the_markers_column = "Name") {
counts_total <- as.data.frame.matrix(
table(.data[[unique_name_row_identifier]], .data[[name_of_the_markers_column]])
)
......@@ -23,41 +22,26 @@ make_count_dataframe <- function(.data, unique_name_row_identifier = "filter_ima
# add sum of the markers
counts_total$sum_cells <- apply(MARGIN = 1, X = counts_total[, markers_names], FUN = sum)
# # calculate the ratios
# lapply(markers_names, \(marker) {
# counts_total[[paste0(marker, "_ratio_of_total_cells2")]] <<- round(counts_total[[marker]]/counts_total[["sum_cells"]], 2)
# })
# Calculate the ratios
counts_total[paste0(markers_names, "_ratio_of_total_cells")] <-
round(counts_total[, markers_names] / counts_total[["sum_cells"]], 2)
# names of the columns
# col_names_of_markers <- colnames(counts_total)[which(grepl(x = colnames(counts_total), pattern = "_ratio_of_total_cells"))]
counts_total[[unique_name_row_identifier]] <- row.names(counts_total)
# get variables back
counts_total$PID <- sapply(strsplit(counts_total[[unique_name_row_identifier]], "_"), '[', 1)
counts_total$DOC <- sapply(strsplit(counts_total[[unique_name_row_identifier]], "_"), '[', 2)
counts_total$Date <- sapply(strsplit(counts_total[[unique_name_row_identifier]], "_"), '[', 3)
counts_total$Tissue <- sapply(strsplit(counts_total[[unique_name_row_identifier]], "_"), '[', 4)
counts_total$Image_number <- sapply(strsplit(counts_total[[unique_name_row_identifier]], "_"), '[', 5)
counts_total$Treatment <- sapply(strsplit(counts_total[[unique_name_row_identifier]], "_"), '[', 6)
counts_total$Concentration <- sapply(strsplit(counts_total[[unique_name_row_identifier]], "_"), '[', 7)
counts_total$ConcentrationUnits <- sapply(strsplit(counts_total[[unique_name_row_identifier]], "_"), '[', 8)
counts_total$ReplicaOrNot <- sapply(strsplit(counts_total[[unique_name_row_identifier]], "_"), '[', 9)
# get variables back from the filter column
counts_total$PID <- sapply(strsplit(counts_total[[unique_name_row_identifier]], "_"), "[", 1)
counts_total$Date <- sapply(strsplit(counts_total[[unique_name_row_identifier]], "_"), "[", 2)
counts_total$DOC <- sapply(strsplit(counts_total[[unique_name_row_identifier]], "_"), "[", 3)
counts_total$Tissue <- sapply(strsplit(counts_total[[unique_name_row_identifier]], "_"), "[", 4)
counts_total$Image_number <- sapply(strsplit(counts_total[[unique_name_row_identifier]], "_"), "[", 5)
counts_total$Treatment <- sapply(strsplit(counts_total[[unique_name_row_identifier]], "_"), "[", 6)
counts_total$Concentration1 <- sapply(strsplit(counts_total[[unique_name_row_identifier]], "_"), "[", 7)
counts_total$Concentration2 <- sapply(strsplit(counts_total[[unique_name_row_identifier]], "_"), "[", 8)
counts_total$ConcentrationUnits1 <- sapply(strsplit(counts_total[[unique_name_row_identifier]], "_"), "[", 9)
counts_total$ConcentrationUnits2 <- sapply(strsplit(counts_total[[unique_name_row_identifier]], "_"), "[", 10)
counts_total$ReplicaOrNot <- sapply(strsplit(counts_total[[unique_name_row_identifier]], "_"), "[", 11)
counts_total$Treatment_complete <- sapply(strsplit(counts_total[[unique_name_row_identifier]], "_"), "[", 12)
# add drug plus concentration plus units
for (i in unique(tolower(counts_total$Treatment))) {
rows <- tolower(counts_total$Treatment) == i
# Check if the current treatment is not in the specified list
if (!i %in% c("dmso", "control", "ctrl")) {
counts_total$Treatment_complete[rows] <- paste(counts_total$Treatment[rows], counts_total$Concentration[rows], counts_total$ConcentrationUnits[rows], sep = ".")
} else {
counts_total$Treatment_complete[rows] <- counts_total$Treatment[rows]
}
}
# Return the data
return(
......
......@@ -3,10 +3,11 @@
#' When this function run the first time, it will generated a config.txt file in the user working directory.
#' It will import the data config file into the use environment. This data will be used to change the column names
#' of the imported dataset and change the name of the markers that is often incorrectly exported.
#' @param overwrite_config Boolean, if TRUE the `config_DRUGSENS.txt` will be overwritten (default is FALSE)
#' @export
#' @return A `dataframe`/`tibble`.
#' @example
make_run_config <- function() {
make_run_config <- function(overwrite_config = FALSE) {
if (file.exists("config_DRUGSENS.txt")) {
tryCatch(
expr = {
......@@ -19,7 +20,8 @@ make_run_config <- function() {
Once the 'config.txt' is available re-run run_config to veryfy that the data was correctly read")
}
)
} else {
} else if (overwrite_config){
message("Overwriting config_DRUGSENS.txt")
write(
x =
(
......@@ -27,7 +29,25 @@ make_run_config <- function() {
# List of markers to relabel
list_of_relabeling =
list(
"PathCellObject" = "DAPI",
"PathCellObject" = "onlyDAPIPositve",
"cCasp3" = "cCASP3",
"E-Cadherin: cCASP3" = "E-Cadherin and cCASP3",
"EpCAM_E-Cadherin" = "E-Cadherin",
"EpCAM_E-Cadherin and cCASP3" = "E-Cadherin and cCASP3"
)'
),
file = paste0(path.expand(getwd()), "/config_DRUGSENS.txt")
)
message("config_DRUGSENS.txt has been overwritten correctly.")
} else {
write(
x =
(
'
# List of markers to relabel
list_of_relabeling =
list(
"PathCellObject" = "onlyDAPIPositve",
"cCasp3" = "cCASP3",
"E-Cadherin: cCASP3" = "E-Cadherin and cCASP3",
"EpCAM_E-Cadherin" = "E-Cadherin",
......
#' Main parsing function
#' @description
#' This function will parse the data from the Image name and will return the metadata there contained
#' The metadata will be then associated to the count file as well
#' @import knitr
#' @import testthat
#' @importFrom stringr str_extract
#' @importFrom stringr str_count
#' @return A `dataframe`/`tibble`.
#' @param .data dataframe with parsed metadata
#' @example
#' data.parsed <- string_parsing(.data)
#' #This will return the dataframe of all the data in the folder
# Main function to bind data from multiple files
string_parsing <- function(.data) {
# add the image name
.data$Image_number <- stringr::str_extract(
string = .data$Image,
pattern = "series.\\d*"
)
multiple_drugs <- list()
# Idea I could add the configuration of the relative position of the various elements of the text by providing this configuration in the config file
# in the case of more complex scenario, in the config file, we offer the possibility to manually define the parsing values for more than 3 drugs
# https://bioconductor.org/packages/release/bioc/vignettes/GSVA/inst/doc/GSVA.html
# extract information from the data
.data$PID <- sapply(strsplit(.data$Image, "_"), `[`, 1, simplify = FALSE) |> unlist()
# .data$PID <- str_extract(.data$Image, "[A-Z0-9]+(?=_)")
.data$Tissue <- sapply(strsplit(.data$Image, "_"), `[`, 2, simplify = FALSE) |> unlist()
.data$Date1 <- str_extract(.data$Image, "\\d{4}.\\d{2}.\\d{2}")
.data$DOC <- str_extract(.data$Image, "(?<=DOC)\\d{2,4}.\\d{2}.\\d{2,4}")
.data$ReplicaOrNot <- ifelse(stringr::str_detect(.data$Image, pattern = "Replica|Rep|rep|replica|REPLICA|REP"), "Replica", NA_character_)
.data$Treatment <- sapply(strsplit(.data$Image, "_"), `[`, 5, simplify = FALSE) |> unlist()
for (double_patterns in unique(.data$Treatment)) {
number_maiusc <- stringr::str_count(pattern = "[A-Z]", string = double_patterns)
if ((number_maiusc >= 2) &
(number_maiusc < nchar(double_patterns))) {
# save the double drugs
multiple_drugs[[double_patterns]] <- double_patterns
.data <- .data[.data$Treatment == double_patterns, ]
.data$Concentration1 <- sapply(strsplit(.data$Image, "_"), `[`, 6, simplify = FALSE) |> unlist()
.data$Concentration2 <- sapply(strsplit(.data$Image, "_"), `[`, 8, simplify = FALSE) |> unlist()
.data$ConcentrationUnits1 <- sapply(strsplit(.data$Image, "_"), `[`, 7, simplify = FALSE) |> unlist()
.data$ConcentrationUnits2 <- sapply(strsplit(.data$Image, "_"), `[`, 9, simplify = FALSE) |> unlist()
} else {
.data$Concentration1 <- str_extract(.data$Image, "\\d+(?=_[munp][Mm])")
.data$Concentration2 <- NA_integer_
.data$ConcentrationUnits1 <- str_extract(.data$Image, "[munp][Mm](?=_)")
.data$ConcentrationUnits2 <- NA_character_
}
}
# add drug plus concentration plus units
for (i in unique(tolower(.data$Treatment))) {
rows <- tolower(.data$Treatment) == i
# Check if the current treatment is not in the specified list
if (i %in% c("dmso", "control", "ctrl", "original")) {
.data$Treatment_complete[rows] <- .data$Treatment[rows]
} else if (i %in% tolower(names(multiple_drugs))){
.data$Treatment_complete[rows] <- paste0(.data$Treatment[rows],
.data$Concentration1[rows],
.data$ConcentrationUnits1[rows],
"-",
.data$Concentration2[rows],
.data$ConcentrationUnits2[rows]
)
} else {
.data$Treatment_complete[rows] <- paste0(.data$Treatment[rows],
.data$Concentration1[rows],
.data$ConcentrationUnits1[rows]
)
}
}
return(
.data
)
}
![](https://img.shields.io/badge/R-%3E%3D%204.0.0-blue)
# Overview
Running DRUGSENS for QuPAth script with your project Here we provide the code to run a QuPath for a reproducible example. For more detailed examples please read [QuPath Documentation](https://qupath.readthedocs.io/en/stable/). This script should be placed into scripts within QuPath. We tested this code to a previous version of QuPath.
DRUGSENS is a R-package tha allow users to automatically analyze QuPath&trade; output data from imaging analysis.
Here we include a QuPath&trade; script to run reproducible QuPath&trade;-based image analysis, and some examples on how DRUGSENS can be used. For more detailed examples of QuPath&trade; scripting please refer to [QuPath&trade;'s Documentation](https://qupath.readthedocs.io/en/stable/).
This script should be placed into scripts within QuPath&trade;. We tested this code to a previous version of QuPath&trade.
This packge is complementary to the STAR protocol: `...`
# Installation
......@@ -22,7 +25,7 @@ install.packages("devtools")
pak::pak("r-lib/devtools")
```
You can have a look at it [devtools]("https://github.com/r-lib/devtools")
You can have a look at it [devtools](https://github.com/r-lib/devtools)
# Usage
......@@ -33,16 +36,16 @@ You can also set you working directory with `setwd()`.
### QuPath script used
To make this code locally available:
To make the QuPath script locally available within the working directory, with the currents date:
``` r
library("DRUGSENS")
generate_qupath_script()
```
This function will generate a `script_for_qupath.txt` file with the code that one can copy/paste into the QuPath's script manager. All the sections that contain \<\> should be replaced with the user experimental information. The `columnsToInclude` in the script should also be user defined, depending on the markers used.
This function will generate a `script_for_qupath.txt` file with the code that one can copy/paste into the __QuPath's script manager__. All the sections that contain \<\> should be replaced with the user experimental information. The `columnsToInclude` in the script should also be user defined, depending on the markers used.
It is very important that the file naming structure QuPath's output is maintained for `DRUGSENS` to work correctly.
It is very important that the file naming structure of the QuPath's output is maintained for `DRUGSENS` to work correctly.
``` groovy
//This groovy snipped script was tested with QuPath 4
......@@ -73,9 +76,12 @@ def exportType = PathCellObject.class
// Choose your *full* output path
def outputPath = "<USER_DEFINED_PATH>/<PID>_<TISSUE>_',Sys.Date(),'_<SAMPLE_DOC>_<TREATMENT_INITIALS>_<CONCENTRATION>_<CONCENTRATION_UNITS>_<REPLICA_OR_NOT>_<TUMOR_MARKER>_<APOPTOTIC_MARKER>.csv"
def outputFile = new File(outputPath)
// example <USER_DEFINED_PATH>/B39_Ascites_2023.11.10_DOC2023.10.05_NIRAPARIB_1000_nM_Rep_EpCAM_Ecad_cCasp3_ QuPath will add (series 1) at the end of this line
// example <USER_DEFINED_PATH>/B39_Ascites_2023.11.10_DOC2023.10.05_NIRAPARIB_1000_nM_Rep_EpCAM_Ecad_cCasp3_(series 01).tif
// example <USER_DEFINED_PATH>/B39_Ascites_2023.11.10_DOC2023.10.05_Niraparib_1000_nM_Rep_EpCAM_Ecad_cCasp3_ QuPath will add (series 1, 2 ...etc) at the end of this line, to indicate the stack number
// example <USER_DEFINED_PATH>/B39_Ascites_2023.11.10_DOC2023.10.05_Niraparib_1000_nM_Rep_EpCAM_Ecad_cCasp3_(series 01).tif
//"PID.001_Ascites_2023-11-25_DOC2020-12-14_CarboplatinPaclitaxel_100_uM_10_nM_Ecad_cCasp3_(series 01).tif"
//"A8759_Spleen_2020.11.10_DOC2001.10.05_Compoundx34542_1000_uM_EpCAM_Ecad_cCasp3_(series 01).tif"
//"A8759_Spleen_2020.11.10_DOC2001.10.05_Compoundx34542_1000_uM_EpCAM_Ecad_cCasp3_(series 01).tif"
//"B38_Eye_2023.11.10_DOC2023.10.05_GentamicinePaclitaxel_100_uM_10_nM_EpCAM_Ecad_cCasp3_(series 01).tif"
// Create the measurementExporter and start the export
def exporter = new MeasurementExporter()
......@@ -88,19 +94,21 @@ def exporter = new MeasurementExporter()
print "Done!"
```
> 📝**NOTE**
>The column `Image` must be present in the data for DRUGSENS to parse the metadata correctly. Title style (This Is An Example) is fine, but if you have a drug combination refer to the formatting as described below [Handling drug combinations](#bind-qupath-files).
### Generate configuration file
This command will generate a `config_DRUGSENS.txt` that should be edited to include the names of the cell markers that have been used by the experimenter.
In our case we replaced `"PathCellObject"` with `"onlyDAPIPositve"`.
``` r
make_run_config()
```
Once the file `config_DRUGSENS.txt` has been modified; you can feed it back to `R`; by running the command again.
``` r
make_run_config()
```
Once the file `config_DRUGSENS.txt` has been modified; you can feed it back to `R`; this will be done automatically once you run `data_binding()`.
Now the `list_of_relabeling` should be available in the R environment and it can be used by `DRUGSENS` to work. `list_of_relabeling` is a named list that is required for relabeling the markers name, that is often not user friendly.
In case the markers naming doesn't need corrections/relabeling you can leave the `list_of_relabeling` unchanged.
In case the markers naming doesn't need corrections/relabeling you can leave the `list_of_relabeling` unchanged (but one should still check it).
> 📝**NOTE** It is recommended having no spaces and using camelCase style for the list of cell markers.
> 📝**NOTE** It is recommended having no spaces and using camelCase style for the **list of cell markers**:
>
> - Start the name with a lowercase letter.
> - Do not include spaces or underscores between words.
......@@ -109,26 +117,44 @@ In case the markers naming doesn't need corrections/relabeling you can leave the
### Explore example datasets
We present here a few mock datasets, as an example. Those can be explored from the folder
We present here a few mock datasets, as example workflow. Those files can be explored from the user's R package path:
``` r
system.file("extdata/to_merge/", package = "DRUGSENS")
```
### Bind QuPath files
At first the data is a bunch of separate files which are difficult to make sense of; therefore as first step let's bind them together into a single R dataframe! This should take very little time.
The example data can be bound together with this command:
``` r
bind_data <- data_binding(path_to_the_projects_folder = system.file("extdata/to_merge/", package = "DRUGSENS"), files_extension_to_look_for = "csv")
```
You will be now able to `View(bind_data)`. You should see all the images from the QuPath in one dataframe. This dataframe will have all the metadata parsed from the `Image` column (this is the first column defined in the in `columnsToInclude` within the `script_for_qupath.txt`).
You will be now able to `View(bind_data)`. You should see all the image stacks from the QuPath in one dataframe. This dataframe will have all the metadata parsed from the `Image` column value for every stack (image x) (this is the first column defined in the in `columnsToInclude` within the `script_for_qupath.txt`).
In this code snippets we show an example of mock data `unique(bind_data$PID)` with PIDs: `"A8759" "B36" "B37", "B38", "B39"` and tissue `"Spleen", "p.wash", "Ascites", "Eye"``. You will have all the metadata in one go and also for drug combinations!
### Counting the markers for every image
This function will take the previous step's generated dataframe and it will counts image by image for every sample the number of marker occurrences. This function will keep the metadata
> ⚠️ **WARNING**: As long as you keep the formatting as the above examples.
The dates should also be in the format **yyy-mm-dd**. For the double combinations the two drugs should be wrote together with each of the different drug capilized (**C**arboplatin**P**aclitaxel) and the rest lowercased letters.
For example **CarboplatinPaclitaxel_100_uM_10_nM**. This indicates a drug combination of Carboplatin 100_uM and Paclitaxel 10_nM. Each drug amount and each unit should always be separated by `_`. The first 100_uM belongs to the Carboplatin and the 10_nM belongs to the Paclitaxel. Those constrains are due to the parsing of the strings into useful metadata. If some of the data is not present, you can use a `.` separated by `_`. If you need additional data parsing, please let us know by filing an issue on GitLab [GitLab Issue]("https://git.scicore.unibas.ch/ovca-research/DRUGSENS/issues").
### Counting the number of positiive cells for each marker in every image
This function will take the previous step's generated dataframe and it will counts image by image (sum the markers of every stack) for every sample the number of marker occurrences.
``` r
counts_dataframe <- make_count_dataframe(bind_data)
```
### Some plotting
This function will take the previous step's generated dataframe (`bind_data`) and it will generate some plots for every cell marker and for some key features from the QuPath metadata, such as nucleus area of the markers and the mean of expression per maker. The script will generate separate folders for each PID in the dataset. There might be quite some plots, therefore you can isolate specific PID or specific treatment, if that is required. With `fill_color_variable` different variables present in the metadata can be tested to visualize the data.
``` r
get_QC_plots_parsed_merged_data(bind_data,
fill_color_variable = "Tissue",
save_plots = TRUE)
# OR
get_QC_plots_parsed_merged_data(bind_data,
fill_color_variable = "Treatment_complete",
save_plots = TRUE)
```
<img src="assets/QC_plot3.png" alt="QC Plot example" title="QC Plot example" width="500" height="500"/>
<br>
### Making plotting-ready data
This function will change the wider format into longer format keeping all the metadata
``` r
......@@ -155,7 +181,8 @@ That follows the structure suggested in the QuPath script
```
"<USER_DEFINED_PATH>/<PID>_<TISSUE>_',Sys.Date(),'_<SAMPLE_DOC>_<TREATMENT_INITIALS>_<CONCENTRATION>_<CONCENTRATION_UNITS>_<REPLICA_OR_NOT>_<TUMOR_MARKER>_<APOPTOTIC_MARKER>.csv"
```
> ⚠️ **WARNING**: It is highly recommended to follow the recommended naming structure to obtain the correct output
> ⚠️ **WARNING**: It is highly recommended to follow the recommended naming structure to obtain the correct output. The dates should also be in the format **yyy-mm-dd**. For the double combinations the two drugs should be wrote together with each of the different drug capilized (**C**arboplatin**P**aclitaxel) and the rest lowercased letters.
For example **CarboplatinPaclitaxel_100_uM_10_nM**. This indicates a drug combination of Carboplatin 100_uM and Paclitaxel 10_nM. Each drug amount and each unit should always be separated by `_`. The first 100_uM belongs to the Carboplatin and the 10_nM belongs to the Paclitaxel. Those constrains are due to the parsing of the string into useful metadata. If some of the data is not present, you can use a `.` separated by `_`.
### Data Binding and Processing
......@@ -173,22 +200,41 @@ bind_data <- data_binding(path_to_the_projects_folder = defined_path,
files_extension_to_look_for = desired_file_extensions, recursive_search = FALSE)
```
> 📝**NOTE**It is recommended to run `data_binding()` with `recursive_search = FALSE` in the case that the target folder has subfolders that belong to other projects that use other cell markers.
Each file is read, and additional metadata is extracted. This will return a dataframe of all the csv files within the folder merged with some additional parsing, the metadata is parsed from the file name will be retrieved and appended to the data. Metadata such as:
- **PID** = A unique identifier assigned to each sample. This ID helps in distinguishing and tracking individual samples' data throughout the experiment.
- **Date1** = The date on which the experiment or analysis was conducted. This field records when the data was generated or processed.
- **DOC** = The date when the biological sample was collected.
- **Tissue** = Indicates the type of tissue from which the sample was derived. This could be a specific organ or cell type
- **Image_number** = Represents the order or sequence number of the image in a stack of images
- **Treatment** = The name or type of drug treatment applied to the sample
- **Concentration** = The amount of the drug treatment applied (concentration), quantitatively described.
- **ConcentrationUnits** = The units in which the drug concentration is measured, such as micromolar (uM) or nanomolar (nM)
- **ReplicaOrNot** = Indicates whether the sample is a replica or repeat of a previous experiment
- **Name** = The standardized name of the cell markers as defined in the `config_DRUGSENS.txt` file. This ensures consistency and accuracy in identifying and referring to specific cell markers.
- `DOC` = The date when the biological sample was collected.
- `Tissue` = Indicates the type of tissue from which the sample was derived. This could be a specific organ or cell type
- `Image_number` = Represents the order or sequence number of the image in a stack of images
- `Treatment` = The name or type of drug treatment applied to the sample
- `Concentration` = The amount of the drug treatment applied (concentration), quantitatively described.
- `ConcentrationUnits` = The units in which the drug concentration is measured, such as micromolar (uM) or nanomolar (nM)
- `ReplicaOrNot` = Indicates whether the sample is a replica or repeat of a previous experiment
- `Name` = The standardized name of the cell markers as defined in the `config_DRUGSENS.txt` file. This ensures consistency and accuracy in identifying and referring to specific cell markers.
### A first QC plot
``` r
plots <- get_QC_plots_parsed_merged_data(bind_data,
fill_color_variable = "Treatment_complete", save_plots = TRUE)
```
<img src="assets/QC_plot1.png" alt="QC Plot example" title="QC Plot example" width="500" height="500"/>
<br>
This plots shows the ratio of expressed markers in the various conditions, other variables (for `example Tissues` or `Treatment_complete`) can also be used. The data will be, by default, be saved in the current working directory in a folder called `figures` and will make subfolders for each PID present in the dataset.
- `.data`: The main dataset, expected to be a dataframe that has been preprocessed and formatted into a long format.
- `list_of_columns_to_plot`: Specifies which columns in the data should be visualized. Defaults to all numeric columns if not provided.
- `save_plots`: A boolean flag indicating whether the plots should be saved as files.
- `saving_plots_folder`: The directory where plot files will be saved, with "figures" as the default.
- `fill_color_variable`: Specifies a variable in the data to use for color-coding the plots, enhancing the visual distinction between different data groups.
- `PID_column_name`, isolate_specific_drug, isolate_specific_patient: Parameters allowing for the isolation of data based on patient ID or specific treatment, facilitating targeted analysis.
- `drug_column_name`: Defines the column that indicates the treatment or drug, with "Treatment" as the default.
- `save_list_of_plots`, `save_plots_in_patient_specific_subfolders`: Flags controlling the saving behavior of the plots, including the option to save in patient-specific subfolders for organized file management.
- `p_height`, `p_width`: Parameters to customize the height and width of the generated plots, ensuring they fit the desired visualization scale.
### Cell markers counting
`make_count_dataframe()`, is designed for processing microscopy data stored in a dataframe. It counts occurrences of different markers present in the dataset and computes additional metadata based on unique identifiers within each row.
......@@ -265,6 +311,6 @@ Renv will automatically activate and install the necessary packages as specified
</details>
### Reporting Issues
If you encounter any bugs or have suggestions for improvements, please file an issue using our [GitLab]("https://git.scicore.unibas.ch/ovca-research/DRUGSENS/issues"). Be sure to include as much information as possible to help us understand and address the issue.
If you encounter any bugs or have suggestions for improvements, please file an issue using our [GitLab Issue]("https://git.scicore.unibas.ch/ovca-research/DRUGSENS/issues"). Be sure to include as much information as possible to help us understand and address the issue.
Please make sure to file the issue in gitlab as the GitHub is a mirror repo.
assets/QC_plot1.png

821 KiB

assets/QC_plot2.png

364 KiB

assets/QC_plot3.png

253 KiB

Source diff could not be displayed: it is too large. Options to address this: view the blob.
......@@ -6,14 +6,14 @@
\usage{
data_binding(
path_to_the_projects_folder,
files_extension_to_look_for,
files_extension_to_look_for = "csv",
recursive_search = FALSE
)
}
\arguments{
\item{path_to_the_projects_folder}{The path where the files coming out of QuPath are located}
\item{path_to_the_projects_folder}{String/Path The path where the files coming out of QuPath are located}
\item{files_extension_to_look_for}{The extension of the file outputted from QuPath}
\item{files_extension_to_look_for}{String The extension of the file outputted from QuPath, (default is "csv")}
\item{recursive_search}{Boolean, it defined the behavior of the file search, if recursive or not, (default is FALSE)}
}
......
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/get_QC_plots_and_stats.R
\name{get_QC_plots_parsed_merged_data}
\alias{get_QC_plots_parsed_merged_data}
\title{Plot some QC plots for the bound data}
\usage{
get_QC_plots_parsed_merged_data(
.data,
list_of_columns_to_plot = NULL,
save_plots = FALSE,
saving_plots_folder = "figures",
save_plots_in_patient_specific_subfolders = TRUE,
fill_color_variable = NULL,
PID_column_name = "PID",
isolate_specific_drug = NULL,
isolate_specific_patient = NULL,
drug_column_name = "Treatment",
save_list_of_plots = TRUE,
p_height = 10,
p_width = 10
)
}
\arguments{
\item{.data}{The preprocessed data (after running make_count_dataframe() and change_data_format_to_longer()) merged data.frame that should be visualized}
\item{list_of_columns_to_plot}{The preprocessed data (after running make_count_dataframe() and change_data_format_to_longer()) merged data.frame that should be visualized}
\item{save_plots}{Boolean, TRUE if plots should be saved (default is FALSE)}
\item{saving_plots_folder}{String indicating the folder where the plots should be stored (default is "figures")}
\item{save_plots_in_patient_specific_subfolders}{Boolean, if TRUE the plots will be saved (if \code{save_plots} TRUE) in sample specific folders (default is TRUE)}
\item{fill_color_variable}{Boolean, String, indicating the name of the variable (discrete) to use for the plot's filling}
\item{PID_column_name}{String, indicating the name of the sample to subset (default is "Treatment")}
\item{isolate_specific_drug}{String, indicating if there should be a Treatment specific data subset (default is NULL)}
\item{isolate_specific_patient}{String, indicating a spacific sample to plot only (default is NULL)}
\item{drug_column_name}{String, indicate the column indicating the Drug/Treament (default is "Treatment")}
\item{save_list_of_plots}{Boolean, if TRUE returns a named list of all the plots ran (default is TRUE), this can be usefult to isolate specific plots}
\item{p_height}{Integer, indicate the plot's height (default is 10 inches)}
\item{p_width}{Integer, indicate the plot's width (default is 10 inches)}
}
\value{
A \code{list}/\code{NULL}.
}
\description{
This plot can show trends within the dataset and run some basic statistics.
}
......@@ -4,7 +4,10 @@
\alias{make_run_config}
\title{Generates and use a config txt file}
\usage{
make_run_config()
make_run_config(overwrite_config = FALSE)
}
\arguments{
\item{overwrite_config}{Boolean, if TRUE the \code{config_DRUGSENS.txt} will be overwritten (default is FALSE)}
}
\value{
A \code{dataframe}/\code{tibble}.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment