Systematic benchmarking of clustering methods on complete and incomplete multi-modal data
1 Abstract
We present a comprehensive roadmap of recommendations for guiding an optimal clustering analysis of multi-modal datasets, tailored to the specific characteristics of the data. Although clustering is one of the most common data analysis tasks in multi-modal settings, no clear guidelines exist on how to choose the best algorithm to partition a given dataset into clusters. To fill this gap, we conducted a systematic empirical benchmarking study using 20 multi-modal datasets, both fully observed and with modality-wise missing data. We evaluated 29 algorithms from multiple perspectives (ground-truth agreement, cluster stability, robustness to missing data, cluster structure and computational efficiency) using diverse and robust metrics. Our findings highlight that IMSR and SNF deliver the best overall performance on complete datasets, while NEMO, PIMVC, and IMSR were the most effective methods when working with incomplete multi-modal data. All results have been made publicly available through an interactive website: https://mmcbench.netlify.app.
2 Setting environment
Code
if (!require('ggplot2')) install.packages('ggplot2', version="3.5.1"); library('ggplot2')
if (!require('dplyr')) install.packages('dplyr', version="1.1.4"); library('dplyr')
if (!require('gghighlight')) install.packages('gghighlight', version="0.4.1"); library('gghighlight')
if (!require('mlr3benchmark')) install.packages('mlr3benchmark', version="0.1.6"); library('mlr3benchmark')
if (!require('pals')) install.packages('pals', version="1.8"); library('pals')
if (!require('ggdist')) install.packages('ggdist', version="3.3.2"); library('ggdist')
if (!require('tidyr')) install.packages('tidyr', version="1.3.1"); library('tidyr')
if (!require('readr')) install.packages('readr', version="2.1.5"); library('readr')
if (!require('cowplot')) install.packages('cowplot', version="1.1.3"); library('cowplot')
if (!require('ggiraphExtra')) install.packages('ggiraphExtra', version="0.3.0"); library('ggiraphExtra')
if (!require('corrplot')) install.packages('corrplot', version="0.92"); library('corrplot')
if (!require('plotly')) install.packages('plotly', version="4.10.4"); library('plotly')
if (!require('htmlwidgets')) install.packages('htmlwidgets', version= "1.6.4"); library('htmlwidgets')
if (!require('reactable')) install.packages('reactable', version= "0.4.4"); library('reactable')
if (!require('svglite')) install.packages('svglite', version= "2.1.3"); library('svglite')
if (!require('PMCMRplus')) install.packages('PMCMRplus', version= "1.9.10"); library('PMCMRplus')
if (!require('igraph')) install.packages('igraph', version= "2.0.3"); library('igraph')
if (!require('ggraph')) install.packages('ggraph', version= "2.2.1"); library('ggraph')
if (!require('xtable')) install.packages('xtable', version= "1.8.4"); library('xtable')
if (!require('caTools')) install.packages('caTools', version= "1.18.3"); library('caTools')
if (!require('Rtsne')) install.packages('Rtsne', version= "0.17"); library('Rtsne')
library('tibble')
library('ggrepel')
library('reshape2')
library('rlang')
library("stringr")
library("grid")
# if (!require('rmarkdown')) install.packages('rmarkdown', dependencies =T); library('rmarkdown')
3 Loading files
Code
paper_results_folder <- "paper_results"
paper_figures_folder <- "paper_figures"
incomplete_algs_metrics_file <- "incomplete_algorithms_metrics.csv"
incomplete_algs_metrics_path <- file.path(paper_results_folder, incomplete_algs_metrics_file)
complete_algs_metrics_file <- "complete_algorithms_metrics.csv"
complete_algs_metrics_path <- file.path(paper_results_folder, complete_algs_metrics_file)
uns_metrics_by_com_alg_file <- "uns_metrics_by_com_alg.csv"
uns_metrics_by_com_alg_path <- file.path(paper_results_folder, uns_metrics_by_com_alg_file)
uns_metrics_by_incom_alg_file <- "uns_metrics_by_inc_alg.csv"
uns_metrics_by_incom_alg_path <- file.path(paper_results_folder, uns_metrics_by_incom_alg_file)
incomplete_algs_rbmetrics_file <- "incomplete_algorithms_rbmetrics.csv"
incomplete_algs_rbmetrics_path <- file.path(paper_results_folder, incomplete_algs_rbmetrics_file)
complete_algs_rbmetrics_file <- "complete_algorithms_rbmetrics.csv"
complete_algs_rbmetrics_path <- file.path(paper_results_folder, complete_algs_rbmetrics_file)
alg_comparisons_file <- "alg_comparisons.csv"
alg_comparisons_path <- file.path(paper_results_folder, alg_comparisons_file)
resources_file <- "time_evaluation.csv"
resources_path <- file.path(paper_results_folder, resources_file)
dataset_table_file <- "dataset_table.csv"
dataset_table_path <- file.path(paper_results_folder, dataset_table_file)
com_results <- read_csv(complete_algs_metrics_path, show_col_types = FALSE)
incom_results <- read_csv(incomplete_algs_metrics_path, show_col_types = FALSE)
com_rbresults <- read_csv(complete_algs_rbmetrics_path, show_col_types = FALSE)
incom_rbresults <- read_csv(incomplete_algs_rbmetrics_path, show_col_types = FALSE)
incomplete_algorithms <- unique(incom_results$Algorithm)
complete_algorithms <- setdiff(unique(com_results$Algorithm),incomplete_algorithms)
uns_metrics_by_com_alg <- read_csv(uns_metrics_by_com_alg_path, show_col_types = FALSE)
uns_metrics_by_incom_alg <- read_csv(uns_metrics_by_incom_alg_path, show_col_types = FALSE)
alg_comparisons <- read_csv(alg_comparisons_path, show_col_types = FALSE)
resources_df <- read_csv(resources_path, show_col_types = FALSE)
dataset_table <- read_csv(dataset_table_path, show_col_types = FALSE)
dataset_table$Cells <- dataset_table$Samples * dataset_table[["Total features"]]
dataset_table <- rbind(
dataset_table %>%
mutate(Dataset = replace(Dataset, Dataset == "Nutrimouse", "nm_gen")) %>%
mutate(Clusters = replace(Clusters, Dataset == "nm_gen", 2)),
dataset_table[dataset_table[["Dataset"]] == "Nutrimouse",] %>%
mutate(Dataset = replace(Dataset, Dataset == "Nutrimouse", "nm_diet")) %>%
mutate(Clusters = replace(Clusters, Dataset == "nm_diet", 5)))
algorithm_families <- data.frame(
"Algorithm" = c("AJIVE", "COCA", "KMeans", "DFMF", "EEIMVC",
"GPCA", "IMSR", "jNMF", "LFIMVC", "MOFA",
"MRGCN", "MSNE", "MVCRSC", "MVSC", "NEMO",
"NMF", "Parea", "SIMCADC", "SNF", "DAIMC",
"IMSCAGL", "iNMF", "MKKMIK", "MONET_EO", "MONET", "MONET_IO", "OMVC",
"OPIMC", "OSLFIMVC", "PIMVC", "SUMO"),
"Strategy" = c("Subspace", "Ensemble", "Early-fusion", "Subspace", "Kernel",
"Subspace", "Subspace", "Subspace", "Kernel", "Subspace",
"Deep learning", "Graph", "Graph", "Graph", "Graph",
"Subspace", "Ensemble", "Graph", "Graph", "Subspace",
"Graph", "Subspace", "Kernel", "Graph", "Graph", "Graph", "Subspace",
"Subspace", "Kernel", "Subspace", "Subspace")
)
rename_var <- function(df) {
df <- df %>%
mutate(Dataset = replace(Dataset, Dataset == "forest", "Forest")) %>%
mutate(Dataset = replace(Dataset, Dataset == "derma", "Derm")) %>%
mutate(Dataset = replace(Dataset, Dataset == "buaa", "BUAA")) %>%
mutate(Dataset = replace(Dataset, Dataset == "statlog", "Statlog")) %>%
mutate(Dataset = replace(Dataset, Dataset == "prokaryotic", "Prokaryotic")) %>%
mutate(Dataset = replace(Dataset, Dataset == "wisconsin", "Wisconsin")) %>%
mutate(Dataset = replace(Dataset, Dataset == "bbcsport", "BBCSport")) %>%
mutate(Dataset = replace(Dataset, Dataset == "bdgp", "BDGP")) %>%
mutate(Dataset = replace(Dataset, Dataset == "webkb", "WebKB")) %>%
mutate(Dataset = replace(Dataset, Dataset == "metabric", "METABRIC")) %>%
mutate(Dataset = replace(Dataset, Dataset == "tcga", "TCGA")) %>%
mutate(Dataset = replace(Dataset, Dataset == "nuswide", "NUSWIDE")) %>%
mutate(Dataset = replace(Dataset, Dataset == "caltech101", "Caltech101"))
return(df)
}
com_results <- com_results %>%
left_join(algorithm_families, by = "Algorithm") %>%
rename_var()
incom_results <- incom_results %>%
rename_var()
uns_metrics_by_com_alg <- uns_metrics_by_com_alg %>%
left_join(algorithm_families, by = "Algorithm") %>%
rename_var()
uns_metrics_by_incom_alg <- uns_metrics_by_incom_alg %>%
rename_var()
com_rbresults <- com_rbresults %>%
left_join(algorithm_families, by = "Algorithm") %>%
rename_var()
incom_rbresults <- incom_rbresults %>%
rename_var()
resources_df <- resources_df %>%
mutate(
Algorithm = factor(algorithm),
Dataset = dataset,
Time = time,
Memory = memory_usage*1e-6,
Peak = peak*1e-6,
CPU = cpu_usage) %>%
rename_var() %>%
mutate(Dataset = factor(Dataset))
amputation_mechanisms <- c("All", "MEM", "PM", "MCAR", "MNAR")
unique_algorithms <- unique(com_results$Algorithm)
colors <- polychrome(length(unique_algorithms))
algorithm_colors <- setNames(colors, unique_algorithms)
# com_results$color_alg <- algorithm_colors[com_results$Algorithm]
unique_datasets <- unique(com_results$Dataset)
colors <- cols25(length(unique_datasets))
dataset_colors <- setNames(colors, unique_datasets)
# com_results$color_dat <- dataset_colors[com_results$Dataset]
marks <- seq(length(unique_algorithms))
algorithm_marks <- setNames(marks, unique_algorithms)
# com_results$mark_alg <- algorithm_marks[com_results$Algorithm]
metrics_performance <- c("MCC", "AMI", "ARI")
extraw_metrics_performance <- "_mean"
metrics_performance <- paste0(metrics_performance, extraw_metrics_performance)
metrics_stab <- c("AMI", "ARI")
extraw_metrics_stab <- "_Stab"
metrics_stab <- paste0(metrics_stab, extraw_metrics_stab)
metrics_rob <- c("AMI", "ARI")
metrics_rob <- paste0(metrics_rob, extraw_metrics_performance)
baseline <- "KMeans"4 Results
We aimed to provide a holistic assessment of clustering performance by evaluating the clustering solutions from multiple perspectives. Specific metrics were chosen for each aspect of performance, ensuring they were robust to group imbalance and to the number of clusters. We evaluated the following aspects (a minimal sketch of the agreement and stability computations follows this list):
Ground-truth agreement. We compared the clusters identified by the algorithms with the ground-truth labels of the original groups in each dataset using the following metrics: Matthews correlation coefficient (MCC), adjusted mutual information (AMI) and adjusted Rand index (ARI).
Cluster stability. To measure the consistency of the clustering solutions, we compared the pairwise clustering results across independent runs using AMI and ARI.
Internal cluster indices. We used unsupervised quality indices that assess cluster structure, such as intrinsic compactness, connectedness, and separation of the cluster partitions, using only information inherent to the dataset. Nine indices were computed: silhouette; variance ratio criterion (VRC); density-based clustering validation index (DBCV); Duda-Hart index (DHI); Dunn index; sum of squared error index (SSEI); R-squared index (RSI); Davies-Bouldin index (DB); and the Ball-Hall index (BHI).
Cluster robustness. For incomplete data, we evaluated how closely the clusters matched those obtained in the reference scenario with complete data, using AMI and ARI.
Computational efficiency. We measured computational efficiency in terms of elapsed time and resource consumption (CPU and memory usage).
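To make these computations concrete, the sketch below shows how ground-truth agreement (AMI, ARI) can be obtained and how stability can be summarized as the mean pairwise ARI across independent runs. It is an illustrative example on toy label vectors, assuming the aricode package (not part of the setup above); it is not the benchmark pipeline itself.
Code
# Illustrative sketch only, not the benchmark pipeline. Assumes the aricode
# package is installed (it is not loaded in the setup above).
library(aricode)
truth <- c(1, 1, 2, 2, 3, 3)  # ground-truth groups (toy example)
pred  <- c(1, 1, 2, 3, 3, 3)  # clusters found by an algorithm
ARI(truth, pred)  # adjusted Rand index
AMI(truth, pred)  # adjusted mutual information
# Stability: mean pairwise ARI across independent runs of the same algorithm.
runs <- list(c(1, 1, 2, 2, 3, 3), c(1, 1, 2, 2, 3, 2), c(2, 2, 1, 1, 3, 3))
run_pairs <- combn(length(runs), 2)
mean(apply(run_pairs, 2, function(p) ARI(runs[[p[1]]], runs[[p[2]]])))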
Conclusions from the results. Recommendation flowchart for choosing the best clustering algorithms based on multi-modal data characteristics and available computing resources. Limited resources refer to scenarios where certain methods are impractical due to time or hardware constraints. Symbols next to each method indicate the properties the algorithm is expected to satisfy.
4.1 Evaluation on complete data
4.1.1 Comparison with ground-truth labels
Code
results_complete <- com_results[com_results["Amputation_Mechanism"] == "Resampling",]
4.1.1.1 Aggregated
Code
metric <- metrics_performance[1]
results_summary <- results_complete %>%
select(Dataset, Algorithm, all_of(metric)) %>%
group_by(Dataset) %>%
mutate(rank = rank(-!!as.name(metric))) %>%
group_by(Algorithm) %>%
summarise(
"# in Top 3 Models" = sum(rank <= 3),
"# Best" = sum(rank == 1),
"Average Rank" = mean(rank),
"Sd Rank" = sd(rank),
) %>%
arrange(-`# in Top 3 Models`, -`# Best`, `Average Rank`) %>%
mutate_if(is.numeric, round, digits= 1) %>%
left_join(results_complete %>% count(Algorithm, name = "# Datasets"), by = "Algorithm")
sorted_algs <- results_summary$Algorithm
reactable::reactable(results_summary, pagination = TRUE, sortable = TRUE, defaultPageSize = 10)
Code
# print(xtable(results_summary[1:10,] %>% mutate("Average Rank" = paste0(round(`Average Rank`, 1), "\u00B1", round(`Sd Rank`, 1))) %>% select(-c(`Sd Rank`))), include.rownames=FALSE)
External validation summary. Summary of performance, showing the number of times each method was within the top-3 methods, the number of times they achieved the best score, the average rank (mean ± standard deviation), and the number of datasets evaluated.
Code
ba <- expand.grid(Dataset = unique(results_complete$Dataset),
Algorithm = unique(results_complete$Algorithm),
Amputation_Mechanism = "Resampling") %>%
merge(results_complete, all = TRUE) %>%
select(Dataset, Algorithm, all_of(metrics_performance)) %>%
group_by(Dataset) %>%
mutate(across(where(is.numeric), ~ if_else(is.na(.), .[Algorithm == baseline], .))) %>%
mutate(across(where(is.numeric), ~ if_else(is.na(.), 0, .))) %>%
ungroup() %>%
droplevels() %>%
group_by(Algorithm) %>%
mutate(across(where(is.numeric), ~ mean(.), .names = "mean_{col}")) %>%
rename(task_id = Dataset, learner_id = Algorithm) %>%
as_benchmark_aggr()
global_significance <- ba$friedman_test(p.adjust.method = "fdr")$p.value
y_height <- -0.5
minimize <- FALSE
for (metric in metrics_performance) {
cd <- ba$.__enclos_env__$private$.crit_differences(meas = metric, minimize = minimize, baseline= baseline, test = "bd", friedman_global = F)
cdx <- as.numeric(unlist(subset(cd$data, baseline == 1, "mean_rank")))
figure <- autoplot(ba, type = "cd", meas = metric, minimize = minimize, style= 2, friedman_global = F) +
# ggtitle(paste("Metric:", metric == "MCC_mean", "Matthews correlation coefficient", ifelse(metric == "AMI_mean", "Adjusted mutual information", "Adjusted rand index")) +
# theme(plot.title = element_text(size=15, hjust = 0.5, vjust=-1)) +
scale_color_manual(values = algorithm_colors) +
annotate("segment", x = c(cdx - cd$cd, cdx + cd$cd), xend = cdx, y = y_height, yend = y_height, alpha = 0.9, color = "black", linewidth = 1.3) +
annotate("segment", x = c(cdx - cd$cd, cdx + cd$cd), xend = c(cdx - cd$cd, cdx + cd$cd), y = y_height+0.25, yend = y_height-0.25, alpha = 0.9, color = "black", linewidth = 1.3) +
annotate("text", label = paste("Critical Difference (baseline) =", round(cd$cd, 2), sep = " "),
y = y_height - 0.5, x = cdx) +
theme(text = element_text(family = "Times"))
figure$layers[[5]]$aes_params$label <- paste0(figure$layers[[5]]$aes_params$label, "; ", "Friedman test,p=", format(global_significance[[metric]], digits = 2))
figure$layers[[5]]$mapping$y <- figure$layers[[6]]$data$y + 0.4
figure$layers[[5]]$mapping$x <- mean(figure$data$mean_rank)
figure$layers[[4]]$aes_params$size <- 3
figure$data$xend <- pmin(pmax(figure$data$xend, 2), dim(figure$data)[1]-1)
figure$layers[[4]]$aes_params$colour <- "black"
figure$layers[[4]]$aes_params$hjust <- ifelse(figure$data$right, 1, 0)
filename <- paste0("rank_", gsub(extraw_metrics_performance, "", metric))
ggsave(file.path(paper_figures_folder, paste0(filename,".png")), figure, width = 20, height = 13.5, units = "cm", dpi = 300, bg = 'white')
if ((metric == metrics_performance[1]) | (metric == metrics_performance[2])) {
figure <- figure +
theme(axis.text.x = element_text(size = 14), axis.title.x = element_text(size = 18))
ggsave(file.path(paper_figures_folder, paste0(filename,".svg")), figure, bg = 'white')
extrafont::loadfonts()
ggsave(file.path(paper_figures_folder, paste0(filename,".pdf")), figure)
}
}
Critical difference plot. Critical difference plot showing the average rank of the methods. Missing results were estimated with the baseline (K-means), which may have introduced artifacts by making performance appear closer to the baseline than it actually is. The global Friedman test indicates a significant difference in the rankings of the methods across all datasets.
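For reference, the global test underlying this diagram can be reproduced with base R's friedman.test on a datasets-by-algorithms score matrix. The sketch below uses simulated scores and is only an approximation of the mlr3benchmark machinery used above.
Code
# Minimal sketch of the global Friedman test on simulated scores; rows are
# blocks (datasets) and columns are groups (algorithms).
set.seed(1)
scores <- matrix(runif(20 * 5), nrow = 20,
                 dimnames = list(paste0("d", 1:20), paste0("alg", 1:5)))
friedman.test(scores)
# Mean rank per algorithm (rank 1 = best, assuming higher scores are better).
colMeans(t(apply(-scores, 1, rank)))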
Code
for (metric in metrics_performance) {
figure <- results_complete %>%
select(Dataset, Algorithm, all_of(metric), Strategy) %>%
group_by(Dataset) %>%
mutate("{metric}" := !!as.name(metric) / max(!!as.name(metric))) %>%
ungroup() %>%
group_by(Algorithm) %>%
mutate(across(where(is.numeric), ~ mean(.), .names = "mean_{col}")) %>%
arrange(desc(.data[[paste0("mean_", metric)]])) %>%
ungroup() %>%
mutate(Algorithm = factor(Algorithm, levels = unique(Algorithm))) %>%
ggplot(aes(x=Algorithm, y=get(metric), fill=Strategy)) +
geom_boxplot(outliers = FALSE, varwidth = TRUE) +
ylim(0, 1) +
xlab("") + ylab(paste("Normalized", ifelse(metric == "MCC_mean", "Matthews correlation coefficient", ifelse(metric == "AMI_mean", "adjusted mutual information", "adjusted rand index")))) +
theme_cowplot() +
scale_x_discrete(guide = guide_axis(angle = 60)) +
scale_fill_manual(values = c(
"Deep learning" = scales::hue_pal()(6)[1],
"Early-fusion" = scales::hue_pal()(6)[2],
"Ensemble" = scales::hue_pal()(6)[3],
"Graph" = scales::hue_pal()(6)[4],
"Kernel" = scales::hue_pal()(6)[5],
"Subspace" = scales::hue_pal()(6)[6]
)) +
geom_point(aes(x=Algorithm, y=.data[[paste0("mean_", metric)]]), size = 2, shape=2, color= "green", show.legend = FALSE) +
theme(legend.position = c(0.01, 0.14), legend.direction = "horizontal", legend.box.background = element_rect(color = "black"), legend.box.margin = margin(t = 3, r = 3, l=2), text = element_text(family = "Times")) +
guides(fill = guide_legend(nrow = 3, title.position="top"))
filename <- paste0("boxplots_", gsub(extraw_metrics_performance, "", metric))
ggsave(file.path(paper_figures_folder, paste0(filename,".png")), figure, width = 10, height = 8, bg = 'white')
if (metric == metrics_performance[1]) {
figure$data <- figure$data[figure$data$Algorithm %in% unique(figure$data$Algorithm)[1:10],]
figure <- figure +
theme(legend.position = c(0.03, 0.19), legend.text=element_text(size=12), axis.text.x = element_text(size = 15), axis.title.y = element_text(size = 15))
ggsave(file.path(paper_figures_folder, paste0(filename,".svg")), figure, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0(filename,".pdf")), figure)
}
}
Soft-ranking: Normalized metrics across datasets. Boxplots show the 25th, 50th, 75th, and 100th percentiles, with the width indicating the number of datasets on which each algorithm was computed. The green triangle marks the mean score for each method. For the normalization step, each score was divided by the result of the best-performing algorithm on that dataset, so the best solution always received a normalized score of 1. Although iNMF emerged as the best-performing algorithm under this soft-ranking, the result may be influenced by the fact that the method was only evaluated on the smallest (and easiest) datasets.
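The normalization used here amounts to dividing each score by the best score obtained on the same dataset; a toy sketch with hypothetical values:
Code
# Toy illustration of the soft-ranking normalization (hypothetical scores).
toy <- tibble(Dataset = rep(c("A", "B"), each = 3),
              Algorithm = rep(c("m1", "m2", "m3"), 2),
              Score = c(0.9, 0.6, 0.3, 0.5, 0.25, 0.5))
toy %>%
  group_by(Dataset) %>%
  mutate(Normalized = Score / max(Score)) %>%  # best method per dataset gets 1
  ungroup()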
4.1.1.2 Individual
Code
for (metric in metrics_performance) {
figure <- results_complete %>%
select(Dataset, Algorithm, metric) %>%
group_by(Dataset) %>%
mutate(Best = (get(metric) == max(get(metric), na.rm = TRUE))) %>%
ungroup() %>%
mutate(
Algorithm = factor(Algorithm, levels = sorted_algs),
Dataset = factor(Dataset, levels = results_complete %>%
group_by(Dataset) %>% summarize(n = n()) %>%
ungroup() %>% arrange(n) %>% pull(Dataset)),
PointType = ifelse(Best, "Best", "Other")
) %>%
ggplot(aes(x = Algorithm, y = Dataset, size = get(metric), color = PointType)) +
geom_point(alpha = 0.75) +
scale_color_manual(values = c("Other" = "lightblue", "Best" = "purple"), name = "") +
xlab("") + ylab("") +
scale_size(name = gsub(extraw_metrics_performance, "", metric), range = c(1.5, 10), breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1)) +
theme_cowplot(font_family = "Times") +
scale_x_discrete(guide = guide_axis(angle = 60))
filename <- paste0("bubble_", gsub(extraw_metrics_performance, "", metric))
ggsave(file.path(paper_figures_folder, paste0(filename,".png")), figure, width = 10, height = 6, bg = 'white')
if (metric == metrics_performance[1]) {
figure <- figure +
theme(axis.text.x = element_text(size = 10), axis.text.y = element_text(size = 10))
ggsave(file.path(paper_figures_folder, paste0(filename,".svg")), figure, width = 10, height = 6, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0(filename,".pdf")), figure, width = 12, height = 6)
}
}
Individual performance of the algorithms. Bubble plot where the size of each bubble represents the average performance of the algorithm on each dataset.
Code
for (metric in metrics_performance) {
normalized_results_complete <- results_complete %>%
select(Dataset, Algorithm, all_of(metric)) %>%
group_by(Dataset) %>%
mutate("{metric}" := !!as.name(metric) / max(!!as.name(metric))) %>%
ungroup() %>%
group_by(Algorithm) %>%
mutate(across(where(is.numeric), ~ mean(.), .names = "mean_{col}")) %>%
arrange(desc(.data[[paste0("mean_", metric)]])) %>%
ungroup()
alg_coms <- combs(sorted_algs, 2) %>%
as.data.frame() %>%
mutate(
Alg1 = V1,
Alg2 = V2,
p_value = NA
) %>%
ungroup() %>%
select(Alg1, Alg2, p_value)
for (pw_algs in 1:nrow(alg_coms)){
alg1 <- alg_coms[pw_algs,"Alg1"]
alg2 <- alg_coms[pw_algs,"Alg2"]
filtered_data <- normalized_results_complete %>%
filter(Algorithm %in% c(alg1, alg2)) %>%
group_by(Dataset) %>%
filter(n_distinct(Algorithm) == 2) %>%
ungroup()
if (nrow(filtered_data) > 0) {
kruskal_result <- kruskal.test(as.formula(paste(metric, "~ Algorithm")), data = filtered_data)
alg_coms[pw_algs,"p_value"] <- kruskal_result$p.value
}
}
figure <- alg_coms %>%
filter(!is.na(p_value)) %>%
mutate(
Alg1 = factor(Alg1, levels = sorted_algs[sorted_algs %in% Alg1]),
Alg2 = factor(Alg2, levels = rev(sorted_algs[sorted_algs %in% Alg2])),
p_value = round(p_value, digits = 2)
) %>%
arrange(Alg1, Alg2) %>%
ggplot(aes(x = Alg1, y = Alg2, fill = p_value)) +
geom_tile() +
geom_text(aes(label = ifelse(p_value < 0.05, ifelse(p_value <= 0.01, ifelse(p_value <= 0.001, "***", "**"), "*"), "")), color = "black", size=6) +
scale_fill_gradientn(
colors = c("red", "orange", "yellow", "white", "white"),
values = scales::rescale(c(0, 0.05, 0.1, 0.15, 1)),
limits = c(0, 1),
name = "p-value",
na.value = "grey",
guide = guide_colorbar(frame.colour = "black", ticks.colour = "black")
) +
xlab("") + ylab("") +
theme_cowplot() +
theme(axis.text.x=element_text(angle=90, hjust=1, vjust= 0.5), legend.position = c(0.8,0.8), text = element_text(family = "Times")) +
coord_cartesian(clip = "off")
filename <- paste0("diff_", gsub(extraw_metrics_performance, "", metric))
ggsave(file.path(paper_figures_folder, paste0(filename, ".png")), figure, width = 12, height = 9, bg = 'white')
if (metric == metrics_performance[1]) {
figure <- figure + theme(axis.text.x=element_text(angle=90, hjust=1, vjust= 0.5, size=13), axis.text.y=element_text(size=13), legend.position = c(0.1,0.9), plot.margin=grid::unit(c(0,0,-5,-5), "mm"))
# figure$layers[[2]]$aes_params$size <- 4.3
ggsave(file.path(paper_figures_folder, paste0(filename, ".svg")), figure, width = 12, height = 9, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0(filename, ".pdf")), figure, width = 12, height = 9)
}
}
Statistical differences in scoring. Pairwise comparison of the methods with the Kruskal-Wallis test, computed only on the datasets evaluated by both methods (*: p < 0.05; **: p ≤ 0.01; ***: p ≤ 0.001).
Code
interesting_algs <- c("IMSR", "SNF", "NEMO", baseline)
for (metric in metrics_performance) {
figure <- results_complete %>%
select(Dataset, Algorithm, metric) %>%
filter(Algorithm %in% interesting_algs) %>%
group_by(Algorithm) %>%
mutate(across(where(is.numeric), ~ mean(.), .names = "mean_{col}")) %>%
arrange(desc(.data[[paste0("mean_", metric)]])) %>%
ungroup() %>%
mutate(Algorithm = factor(Algorithm, levels = interesting_algs)) %>%
ggplot(aes(x=Algorithm, y=get(metric), color=Dataset)) +
geom_point(aes(), size=4) +
geom_line(aes(group=Dataset), linewidth = 1.4) +
ylim(0, 1) +
xlab("") + ylab(ifelse(metric == "MCC_mean", "Matthews correlation coefficient", ifelse(metric == "AMI_mean", "Adjusted mutual information", "Adjusted rand index"))) +
theme_cowplot() +
scale_color_manual(values = dataset_colors) +
theme(legend.position = "None", text = element_text(family = "Times"))
filename <- paste0("top_", gsub(extraw_metrics_performance, "", metric))
ggsave(file.path(paper_figures_folder, paste0(filename,".png")), figure, bg = 'white')
if (metric == metrics_performance[1]) {
figure <- figure +
theme(axis.text.x = element_text(size = 18), axis.title.y = element_text(size = 20), axis.text.y = element_text(size = 18), plot.margin=grid::unit(c(0,-3,-5,0), "mm"))
# figure <- figure + theme(legend.position = "right") + guides(color= guide_legend(ncol=3))
ggsave(file.path(paper_figures_folder, paste0(filename,".svg")), figure, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0(filename,".pdf")), figure, height = 5)
# ggsave(file.path(paper_figures_folder, paste0(filename,".pdf")), figure)
}
}
Top methods comparison. Score for the top algorithms and the baseline method (K-means).
Code
table_show <- results_complete %>%
group_by(Dataset) %>%
mutate(across(all_of(metrics_performance), ~ rank(-.), .names ="{.col}_rank")) %>%
ungroup() %>%
rename_with(~ str_remove(., extraw_metrics_performance), ends_with("_rank")) %>%
{
metric_cols <- grep(paste0("^(", paste(gsub(extraw_metrics_performance, "", metrics_performance), collapse = "|"), ")_(mean|std|rank)$"),
colnames(.), value = TRUE)
mean_cols <- grep("_mean$", metric_cols, value = TRUE)
std_cols <- grep("_std$", metric_cols, value = TRUE)
rank_cols <- grep("_rank$", metric_cols, value = TRUE)
ordered_cols <- c(rbind(mean_cols, std_cols, rank_cols))
ordered_cols <- ordered_cols[!is.na(ordered_cols)]
select(., Dataset, Algorithm, all_of(ordered_cols))
} %>%
arrange(Dataset, Algorithm) %>%
mutate(across(where(is.numeric), round, digits = 2))
reactable::reactable(table_show, pagination = TRUE, sortable = TRUE, defaultPageSize = 10)
All results. Table with all results, showing mean, standard deviation and rank for each metric.
4.1.2 Internal cluster indices
Code
filtered_table <- expand.grid(Dataset = unique(uns_metrics_by_com_alg$Dataset),
Algorithm = unique(uns_metrics_by_com_alg$Algorithm),
Amputation_Mechanism = "Resampling") %>%
merge(uns_metrics_by_com_alg, all = TRUE) %>%
filter(Amputation_Mechanism == "Resampling") %>%
group_by(Dataset) %>%
mutate(
Silh = rank(Silhouette),
VRC = rank(VRC),
DB = rank(-DB),
DBCV = rank(-DBCV),
Dunn = rank(Dunn),
DHI = rank(-DHI),
SSEI = rank(-SSEI),
RSI = rank(RSI),
BHI = rank(-BHI)) %>%
ungroup() %>%
group_by(Algorithm) %>%
summarize_if(is.numeric, mean) %>%
mutate(mean_rank = rowMeans(select(., where(is.numeric)), na.rm = TRUE)) %>%
mutate(Algorithm = factor(Algorithm, levels = unique(Algorithm[order(mean_rank, decreasing = TRUE)]))) %>%
relocate(DB, .after=RSI) %>%
relocate(DHI, .after=DBCV) %>%
select(-c(AMI_Stab, ARI_Stab, mean_rank, Missing_Percentage, Silhouette))
sorted_algs <- levels(filtered_table$Algorithm)
figure <- filtered_table %>%
ggRadar(aes(facet=Algorithm), colour = Algorithm, rescale = FALSE, interactive = FALSE, size = 1, plot.legend = FALSE, alpha = 0.5) +
theme_bw() +
theme(legend.position="none", axis.ticks.y=element_blank(), text = element_text(family = "Times")) +
scale_y_continuous(breaks = NULL) +
scale_fill_manual(values = algorithm_colors) +
scale_color_manual(values = algorithm_colors)
filename <- "cleval"
ggsave(file.path(paper_figures_folder, paste0(filename, ".png")), figure, width = 8, height = 8, bg = 'white')
figure$data <- figure$data[figure$data$Algorithm %in% pull(results_summary[results_summary[,"# Datasets"] >= 15,], "Algorithm"),]
figure$data <- figure$data[figure$data$Algorithm %in% unique(figure$data$Algorithm)[1:6],]
figure <- figure +
theme(axis.text.x = element_text(size = 7), strip.text = element_text(size = 7), panel.spacing = unit(0.1, "lines"), plot.margin=grid::unit(c(0,0,-6,-5), "mm"), text = element_text(family = "Times")) +
scale_fill_manual(values = rep("#BCCACF", length(unique(figure$data$Algorithm)))) +
scale_color_manual(values = rep("#9EC9D9", length(unique(figure$data$Algorithm))))
ggsave(file.path(paper_figures_folder, paste0(filename, ".svg")), figure, width = 2.5, height = 2., bg = 'white')
ggsave(file.path(paper_figures_folder, paste0(filename, ".pdf")), figure, width = 2.5, height = 2.)Cluster structure quality. Radar chart illustrating various unsupervised internal indices. Each metric was computed based on the algorithm’s rank across datasets, with the mean rank used for comparison. Algorithms are sorted by their mean rank.
Code
table_show <- filtered_table %>%
arrange(Algorithm) %>%
mutate(across(where(is.numeric), \(x) round(length(x)-x, digits = 1))) %>%
left_join(results_complete %>% count(Algorithm, name = "# Datasets"), by = "Algorithm") %>%
arrange(Algorithm)
reactable::reactable(table_show, pagination = TRUE, sortable = TRUE, defaultPageSize = 10)
Internal metrics rank. Rank of the algorithms by dataset for each internal index.
Code
table_show <- uns_metrics_by_com_alg %>%
filter(Amputation_Mechanism == "Resampling") %>%
select(-c(Missing_Percentage, Amputation_Mechanism, Imputation, Strategy, all_of(metrics_stab))) %>%
arrange(Dataset, Algorithm) %>%
mutate(across(where(is.numeric), \(x) round(x, digits = 2)))
reactable::reactable(table_show, pagination = TRUE, sortable = TRUE, defaultPageSize = 10)
All results. Table with all results, showing the mean for each metric.
4.1.3 Stability
Code
stability_results <- uns_metrics_by_com_alg[uns_metrics_by_com_alg["Amputation_Mechanism"] == "Resampling",]
4.1.3.1 Aggregated
Code
metric <- metrics_stab[1]
results_summary <- stability_results %>%
select(Dataset, Algorithm, all_of(metric)) %>%
group_by(Dataset) %>%
mutate(rank = rank(-!!as.name(metric))) %>%
group_by(Algorithm) %>%
summarise(
"# in Top 3 Models" = sum(rank <= 3),
"# Best" = sum(rank == 1),
"Average Rank" = mean(rank),
"Sd Rank" = sd(rank),
) %>%
arrange(-`# in Top 3 Models`, -`# Best`, `Average Rank`) %>%
mutate_if(is.numeric, round, digits= 1) %>%
left_join(results_complete %>% count(Algorithm, name = "# Datasets"), by = "Algorithm")
sorted_algs <- results_summary$Algorithm
reactable::reactable(results_summary, pagination = TRUE, sortable = TRUE, defaultPageSize = 10)
Code
# print(xtable(results_summary[1:10,] %>% mutate("Average Rank" = paste0(round(`Average Rank`, 1), "\u00B1", round(`Sd Rank`, 1))) %>% select(-c(`Sd Rank`))), include.rownames=FALSE)
Cluster stability summary. Summary of performance, showing the number of times each method was within the top-3 methods, the number of times they achieved the best score, the average rank (mean ± standard deviation), and the number of datasets evaluated.
Code
ba <- expand.grid(Dataset = unique(stability_results$Dataset),
Algorithm = unique(stability_results$Algorithm),
Amputation_Mechanism = "Resampling") %>%
merge(stability_results, all = TRUE) %>%
select(Dataset, Algorithm, all_of(metrics_stab)) %>%
group_by(Dataset) %>%
mutate(across(where(is.numeric), ~ if_else(is.na(.), .[Algorithm == baseline], .))) %>%
mutate(across(where(is.numeric), ~ if_else(is.na(.), 0, .))) %>%
ungroup() %>%
droplevels() %>%
group_by(Algorithm) %>%
mutate(across(where(is.numeric), ~ mean(.), .names = "mean_{col}")) %>%
rename(task_id = Dataset, learner_id = Algorithm) %>%
as_benchmark_aggr()
global_significance <- ba$friedman_test(p.adjust.method = "fdr")$p.value
y_height <- -0.5
minimize <- FALSE
for (metric in metrics_stab) {
cd <- ba$.__enclos_env__$private$.crit_differences(meas = metric, minimize = minimize, baseline= baseline, test = "bd", friedman_global = F)
cdx <- as.numeric(unlist(subset(cd$data, baseline == 1, "mean_rank")))
figure <- autoplot(ba, type = "cd", meas = metric, minimize = minimize, style= 2, friedman_global = F) +
scale_color_manual(values = algorithm_colors) +
annotate("segment", x = c(cdx - cd$cd, cdx + cd$cd), xend = cdx, y = y_height, yend = y_height, alpha = 0.9, color = "black", linewidth = 1.3) +
annotate("segment", x = c(cdx - cd$cd, cdx + cd$cd), xend = c(cdx - cd$cd, cdx + cd$cd), y = y_height+0.25, yend = y_height-0.25, alpha = 0.9, color = "black", linewidth = 1.3) +
annotate("text", label = paste("Critical Difference (baseline) =", round(cd$cd, 2), sep = " "),
y = y_height - 0.5, x = cdx) +
theme(text = element_text(family = "Times"))
figure$layers[[5]]$aes_params$label <- paste0(figure$layers[[5]]$aes_params$label, "; ", "Friedman test,p=", format(global_significance[[metric]], digits = 2))
figure$layers[[5]]$mapping$y <- figure$layers[[6]]$data$y + 0.4
figure$layers[[5]]$mapping$x <- mean(figure$data$mean_rank)
figure$layers[[4]]$aes_params$size <- 3
figure$data$xend <- pmin(pmax(figure$data$xend, 2), dim(figure$data)[1]-1)
figure$layers[[4]]$aes_params$colour <- "black"
figure$layers[[4]]$aes_params$hjust <- ifelse(figure$data$right, 1, 0)
filename <- paste0("rank_stab_", gsub(extraw_metrics_stab, "", metric))
ggsave(file.path(paper_figures_folder, paste0(filename,".png")), figure, width = 20, height = 13.5, units = "cm", dpi = 300, bg = 'white')
if (metric == metrics_stab[1]) {
figure <- figure +
theme(axis.text.x = element_text(size = 14), axis.title.x = element_text(size = 18))
ggsave(file.path(paper_figures_folder, paste0(filename,".svg")), figure, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0(filename,".pdf")), figure, bg = 'white')
}
}
Critical difference plot. Critical difference plot showing the average rank of the methods. Missing results were estimated with the baseline (K-means), which may have introduced artifacts by making performance appear closer to the baseline than it actually is. The global Friedman test indicates a significant difference in the rankings of the methods across all datasets.
Code
for (metric in metrics_stab) {
figure <- stability_results %>%
select(Dataset, Algorithm, metric, Strategy) %>%
filter(!(is.na(get(metric)))) %>%
group_by(Dataset) %>%
mutate("{metric}" := !!as.name(metric) / max(!!as.name(metric))) %>%
ungroup() %>%
group_by(Algorithm) %>%
mutate(across(where(is.numeric), ~ mean(.), .names = "mean_{col}")) %>%
arrange(desc(.data[[paste0("mean_", metric)]])) %>%
ungroup() %>%
mutate(Algorithm = factor(Algorithm, levels = unique(Algorithm))) %>%
ggplot(aes(x=Algorithm, y=get(metric), fill=Strategy)) +
geom_boxplot(outliers = FALSE, varwidth = TRUE) +
ylim(0, 1) +
xlab("") + ylab(paste("Normalized", ifelse(metric == "AMI_Stab", "adjusted mutual information", "adjusted rand index"))) +
theme_cowplot() +
scale_x_discrete(guide = guide_axis(angle = 60)) +
scale_fill_manual(values = c(
"Deep learning" = scales::hue_pal()(6)[1],
"Early-fusion" = scales::hue_pal()(6)[2],
"Ensemble" = scales::hue_pal()(6)[3],
"Graph" = scales::hue_pal()(6)[4],
"Kernel" = scales::hue_pal()(6)[5],
"Subspace" = scales::hue_pal()(6)[6]
)) +
geom_point(aes(x=Algorithm, y=.data[[paste0("mean_", metric)]]), size = 2, shape=2, color= "green", show.legend = FALSE) +
theme(legend.position = c(0.01, 0.14), legend.direction = "horizontal", legend.box.background = element_rect(color = "black"), legend.box.margin = margin(t = 3, r = 3, l=2), text = element_text(family = "Times")) +
guides(fill = guide_legend(nrow = 3, title.position="top"))
filename <- paste0("boxplots_stab_", gsub(extraw_metrics_stab, "", metric))
ggsave(file.path(paper_figures_folder, paste0(filename,".png")), figure, width = 10, height = 8, bg = 'white')
figure <- figure +
theme(legend.position = c(0.03, 0.16), legend.text=element_text(size=12), axis.text.x = element_text(size = 15), axis.title.y = element_text(size = 15))
if (metric == metrics_stab[1]) {
figure$data <- figure$data[figure$data$Algorithm %in% unique(figure$data$Algorithm)[1:10],]
ggsave(file.path(paper_figures_folder, paste0(filename,".svg")), figure, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0(filename,".pdf")), figure)
}
}
Soft-ranking: Normalized metrics across datasets. Boxplots show the 25th, 50th, 75th, and 100th percentiles, with the width indicating the number of datasets on which each algorithm was computed. The green triangle marks the mean score for each method. For the normalization step, each score was divided by the result of the best-performing algorithm on that dataset, so the best solution always received a normalized score of 1. Although MONET emerged as the best-performing algorithm under this soft-ranking, the result may be influenced by the fact that the method was only evaluated on the smallest (and easiest) datasets.
4.1.3.2 Individual
Code
for (metric in metrics_stab) {
figure <- stability_results %>%
select(Dataset, Algorithm, metric) %>%
group_by(Dataset) %>%
mutate(Best = (get(metric) == max(get(metric), na.rm = TRUE))) %>%
ungroup() %>%
mutate(
Algorithm = factor(Algorithm, levels = sorted_algs),
Dataset = factor(Dataset, levels = results_complete %>%
group_by(Dataset) %>% summarize(n = n()) %>%
ungroup() %>% arrange(n) %>% pull(Dataset)),
PointType = ifelse(Best & !is.na(Best), "Best", "Other")
) %>%
ggplot(aes(x = Algorithm, y = Dataset, size = get(metric), color = PointType)) +
geom_point(alpha = 0.75) +
scale_color_manual(values = c("Other" = "lightblue", "Best" = "purple"), name = "") +
xlab("") + ylab("") +
scale_size(name = gsub(extraw_metrics_stab, "", metric), range = c(1.5, 10), breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1)) +
theme_cowplot(font_family = "Times") +
scale_x_discrete(guide = guide_axis(angle = 60))
filename <- paste0("bubble_stab_", gsub(extraw_metrics_stab, "", metric))
ggsave(file.path(paper_figures_folder, paste0(filename,".png")), figure, width = 10, height = 6, bg = 'white')
figure <- figure +
theme(axis.text.x = element_text(size = 18), axis.text.y = element_text(size = 18))
if (metric == metrics_stab[1]) {
ggsave(file.path(paper_figures_folder, paste0(filename,".svg")), figure, width = 10, height = 6, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0(filename,".pdf")), figure, width = 12, height = 6)
}
}
Individual performance of the algorithms. Bubble plot where the size of each bubble represents the average performance of the algorithm on each dataset.
Code
for (metric in metrics_stab) {
normalized_results_complete <- stability_results %>%
select(Dataset, Algorithm, all_of(metric)) %>%
group_by(Dataset) %>%
mutate("{metric}" := !!as.name(metric) / max(!!as.name(metric))) %>%
ungroup() %>%
group_by(Algorithm) %>%
mutate(across(where(is.numeric), ~ mean(.), .names = "mean_{col}")) %>%
arrange(desc(.data[[paste0("mean_", metric)]])) %>%
ungroup()
alg_coms <- combs(sorted_algs, 2) %>%
as.data.frame() %>%
mutate(
Alg1 = V1,
Alg2 = V2,
p_value = NA
) %>%
ungroup() %>%
select(Alg1, Alg2, p_value)
for (pw_algs in 1:nrow(alg_coms)){
alg1 <- alg_coms[pw_algs,"Alg1"]
alg2 <- alg_coms[pw_algs,"Alg2"]
filtered_data <- normalized_results_complete %>%
filter(Algorithm %in% c(alg1, alg2)) %>%
group_by(Dataset) %>%
filter(n_distinct(Algorithm) == 2) %>%
ungroup()
if (nrow(filtered_data) > 0) {
kruskal_result <- kruskal.test(as.formula(paste(metric, "~ Algorithm")), data = filtered_data)
alg_coms[pw_algs,"p_value"] <- kruskal_result$p.value
}
}
figure <- alg_coms %>%
filter(!is.na(p_value)) %>%
mutate(
Alg1 = factor(Alg1, levels = sorted_algs[sorted_algs %in% Alg1]),
Alg2 = factor(Alg2, levels = rev(sorted_algs[sorted_algs %in% Alg2])),
p_value = round(p_value, digits = 2)
) %>%
arrange(Alg1, Alg2) %>%
ggplot(aes(x = Alg1, y = Alg2, fill = p_value)) +
geom_tile() +
geom_text(aes(label = ifelse(p_value < 0.05, ifelse(p_value <= 0.01, ifelse(p_value <= 0.001, "***", "**"), "*"), "")), color = "black", size=6) +
scale_fill_gradientn(
colors = c("red", "orange", "yellow", "white", "white"),
values = scales::rescale(c(0, 0.05, 0.1, 0.15, 1)),
limits = c(0, 1),
name = "p-value",
na.value = "grey",
guide = guide_colorbar(frame.colour = "black", ticks.colour = "black")
) +
xlab("") + ylab("") +
theme_cowplot() +
theme(axis.text.x=element_text(angle=90, hjust=1, vjust= 0.5), legend.position = c(0.8,0.8), text = element_text(family = "Times")) +
coord_cartesian(clip = "off")
filename <- paste0("diff_stab_", gsub(extraw_metrics_stab, "", metric))
ggsave(file.path(paper_figures_folder, paste0(filename, ".png")), figure, width = 12, height = 9, bg = 'white')
if (metric == metrics_stab[1]) {
figure <- figure + theme(axis.text.x=element_text(angle=90, hjust=1, vjust= 0.5, size=13), axis.text.y=element_text(size=13), legend.position = c(0.8,0.8), plot.margin=grid::unit(c(0,0,-5,-5), "mm"))
# figure$layers[[2]]$aes_params$size <- 4.3
ggsave(file.path(paper_figures_folder, paste0(filename, ".svg")), figure, width = 12, height = 9, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0(filename, ".pdf")), figure, width = 12, height = 9)
}
}
Statistical differences in scoring. Pairwise comparison of the methods with the Kruskal-Wallis test, computed only on the datasets evaluated by both methods (*: p < 0.05; **: p ≤ 0.01; ***: p ≤ 0.001).
Code
interesting_algs <- c("IMSR", "SNF", "NEMO", "SIMCADC", baseline)
for (metric in metrics_stab) {
figure <- stability_results %>%
select(Dataset, Algorithm, metric) %>%
filter(Algorithm %in% interesting_algs) %>%
group_by(Algorithm) %>%
mutate(across(where(is.numeric), ~ mean(.), .names = "mean_{col}")) %>%
arrange(desc(.data[[paste0("mean_", metric)]])) %>%
ungroup() %>%
mutate(Algorithm = factor(Algorithm, levels = interesting_algs)) %>%
ggplot(aes(x=Algorithm, y=get(metric), color=Dataset)) +
geom_point(aes(), size=4) +
geom_line(aes(group=Dataset), linewidth = 1.4) +
ylim(0, 1) +
xlab("") + ylab(ifelse(metric == "AMI_Stab", "Adjusted mutual information", "Adjusted rand index")) +
theme_cowplot() +
scale_color_manual(values = dataset_colors) +
theme(legend.position = "None", text = element_text(family = "Times"))
filename <- paste0("top_stab_", gsub(extraw_metrics_stab, "", metric))
ggsave(file.path(paper_figures_folder, paste0(filename,".png")), figure, bg = 'white')
if (metric == metrics_stab[1]) {
figure <- figure + theme(axis.text.x = element_text(size = 18), axis.title.y = element_text(size = 20), axis.text.y = element_text(size = 18), plot.margin=grid::unit(c(0,-3,-5,0), "mm"))
ggsave(file.path(paper_figures_folder, paste0(filename,".svg")), figure, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0(filename,".pdf")), figure)
}
}
Top methods comparison. Score for the top algorithms and the baseline method (K-means).
Code
table_show <- stability_results %>%
group_by(Dataset) %>%
mutate(across(all_of(metrics_stab), ~ rank(-.), .names ="{.col}_rank")) %>%
ungroup() %>%
rename_with(~ str_remove(., extraw_metrics_stab)) %>%
{
ordered_cols <- c(c(gsub(extraw_metrics_stab, "", metrics_stab), paste0(gsub(extraw_metrics_stab, "", metrics_stab), "_rank")))
ordered_cols <- sort(ordered_cols)
select(., Dataset, Algorithm, all_of(ordered_cols))
} %>%
arrange(Dataset, Algorithm) %>%
mutate(across(where(is.numeric), round, digits = 2))
reactable::reactable(table_show, pagination = TRUE, sortable = TRUE, defaultPageSize = 10)
All results. Table with all results, showing the score and rank for each metric.
4.1.4 Computational efficiency
4.1.4.1 Aggregated
Code
time_matrix <- resources_df %>%
select(Dataset, Algorithm, Time)
for (axis in c("All", "Samples", "Features")) {
for (imputing in c(TRUE, FALSE)) {
titlesuffix <- ifelse(axis=="All", "Cell", ifelse(axis=="Samples", "Sample", "Feature"))
figure <- time_matrix %>%
group_by(Dataset) %>%
mutate(Time = ifelse(imputing & is.na(Time), max(Time, na.rm = TRUE), Time)) %>%
ungroup() %>%
left_join(select(dataset_table, Dataset, Samples, Features = "Total features", All= Cells), by = "Dataset") %>%
mutate(Time = Time / !!as.name(axis)) %>%
group_by(Algorithm) %>%
summarise(
avg_time = mean(Time, na.rm = TRUE),
) %>%
arrange(avg_time) %>%
mutate(
Algorithm = factor(Algorithm, levels = unique(Algorithm)),
RelativeSpeed = avg_time / min(avg_time)
) %>%
left_join(algorithm_families, by = "Algorithm") %>%
ggplot(aes(x = reorder(Algorithm, avg_time), y = avg_time, fill=Strategy)) +
geom_bar(stat = "identity") +
geom_text(aes(label = ifelse(RelativeSpeed >= 1000,
paste0(round(RelativeSpeed / 1000), "K"),
ifelse(RelativeSpeed >= 10,
round(RelativeSpeed),
round(RelativeSpeed, 1)))), vjust = -0.3, size= 3) +
ylab(paste("Average time (s) per", tolower(titlesuffix))) + xlab("") +
theme_cowplot() +
theme(axis.text.x=element_text(angle=60, hjust=1), plot.margin=grid::unit(c(1,5,-6,0), "mm"), legend.position= c(0.05, 0.7), text = element_text(family = "Times")) +
coord_cartesian(clip = "off")
filename <- "time_barplot"
filename <- ifelse(imputing, paste0(filename, "_imputed"), filename)
filename <- paste0(filename, "_", tolower(axis))
ggsave(file.path(paper_figures_folder, paste0(filename, ".png")), figure, bg = 'white')
if (!(imputing) & (axis == "Samples")) {
figure <- figure + theme(axis.text.x=element_text(angle=60, hjust=1, size=22), axis.text.y=element_text(size=22), axis.title.y=element_text(size=22), plot.margin=grid::unit(c(3,0,-7,0.5), "mm"), legend.text=element_text(size=22), legend.title=element_text(size=22))
figure$layers[[2]]$aes_params$size <- 7
ggsave(file.path(paper_figures_folder, paste0(filename, ".svg")), figure, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0(filename, ".pdf")), figure, width = 13)
}
if (!(imputing) & (axis == "All")) {
avg_time <- figure$data[c("Algorithm", "avg_time")]
}
}
}
Average computing time. Average computing time (in seconds) across datasets, divided by the number of elements (samples, features, cells) in each dataset, with the relative speed with respect to the fastest algorithm shown on top.
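As a point of reference, a rough in-process timing of a single run can be obtained with base R, as in the sketch below on toy data; the benchmark itself monitored elapsed time, CPU and memory usage externally, so this is only an approximation.
Code
# Rough sketch of per-run timing in base R (toy data; resource monitoring in
# the benchmark was done externally).
x <- matrix(rnorm(2e5), ncol = 20)
elapsed <- unname(system.time(kmeans(x, centers = 5, nstart = 10))["elapsed"])
elapsed / nrow(x)  # average time (s) per sample, as reported above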
Code
cpu_matrix <- resources_df %>%
select(Dataset, Algorithm, CPU)
for (axis in c("All", "Samples", "Features")) {
for (imputing in c(TRUE, FALSE)) {
titlesuffix <- ifelse(axis=="All", "Cell", ifelse(axis=="Samples", "Sample", "Feature"))
figure <- cpu_matrix %>%
group_by(Dataset) %>%
mutate(CPU = ifelse(imputing & is.na(CPU), max(CPU, na.rm = TRUE), CPU)) %>%
ungroup() %>%
left_join(select(dataset_table, Dataset, Samples, Features = "Total features", All= Cells), by = "Dataset") %>%
mutate(CPU = CPU / !!as.name(axis)) %>%
group_by(Algorithm) %>%
summarise(
avg_cpu = mean(CPU, na.rm = TRUE),
) %>%
arrange(avg_cpu) %>%
mutate(
Algorithm = factor(Algorithm, levels = unique(Algorithm)),
RelativeSpeed = avg_cpu / min(avg_cpu)
) %>%
left_join(algorithm_families, by = "Algorithm") %>%
ggplot(aes(x = reorder(Algorithm, avg_cpu), y = avg_cpu, fill=Strategy)) +
geom_bar(stat = "identity") +
geom_text(aes(label = ifelse(RelativeSpeed >= 1000,
paste0(round(RelativeSpeed / 1000, 1), "K"),
ifelse(RelativeSpeed >= 10,
round(RelativeSpeed),
round(RelativeSpeed, 1)))), vjust = -0.3, size= 3) +
ylab(paste("(%) CPU usage per", tolower(titlesuffix))) + xlab("") +
theme_cowplot() +
theme(axis.text.x=element_text(angle=60, hjust=1), plot.margin=grid::unit(c(1,5,-6,0), "mm"), legend.position= c(0.2, 0.7), text = element_text(family = "Times")) +
coord_cartesian(clip = "off")
filename <- "cpu_barplot"
filename <- ifelse(imputing, paste0(filename, "_imputed"), filename)
filename <- paste0(filename, "_", tolower(axis))
ggsave(file.path(paper_figures_folder, paste0(filename, ".png")), figure, bg = 'white')
if (!(imputing) & (axis == "Samples")) {
figure <- figure + theme(axis.text.x=element_text(angle=60, hjust=1, size=22), axis.text.y=element_text(size=22), axis.title.y=element_text(size=22), plot.margin=grid::unit(c(3,0,-7,0.5), "mm"))
figure$layers[[2]]$aes_params$size <- 7
figure$theme$legend.position <- "none"
ggsave(file.path(paper_figures_folder, paste0(filename, ".svg")), figure, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0(filename, ".pdf")), figure, width = 13, height = 5.9)
}
if (!(imputing) & (axis == "All")) {
avg_cpu <- figure$data[c("Algorithm", "avg_cpu")]
}
}
}
Average CPU usage. Average (%) CPU usage across datasets, divided by the number of elements (samples, features, cells) in each dataset, with the relative usage with respect to the most efficient algorithm shown on top.
Code
peak_matrix <- resources_df %>%
select(Dataset, Algorithm, Peak)
for (axis in c("All", "Samples", "Features")) {
for (imputing in c(TRUE, FALSE)) {
titlesuffix <- ifelse(axis=="All", "Cell", ifelse(axis=="Samples", "Sample", "Feature"))
figure <- peak_matrix %>%
group_by(Dataset) %>%
mutate(Peak = ifelse(imputing & is.na(Peak), max(Peak, na.rm = TRUE), Peak)) %>%
ungroup() %>%
left_join(select(dataset_table, Dataset, Samples, Features = "Total features", All= Cells), by = "Dataset") %>%
mutate(Peak = Peak / !!as.name(axis)) %>%
group_by(Algorithm) %>%
summarise(
avg_peak = mean(Peak, na.rm = TRUE),
) %>%
arrange(avg_peak) %>%
mutate(
Algorithm = factor(Algorithm, levels = unique(Algorithm)),
RelativeSpeed = avg_peak / min(avg_peak)
) %>%
left_join(algorithm_families, by = "Algorithm") %>%
ggplot(aes(x = reorder(Algorithm, avg_peak), y = avg_peak, fill=Strategy)) +
geom_bar(stat = "identity") +
geom_text(aes(label = ifelse(RelativeSpeed >= 1000,
paste0(round(RelativeSpeed / 1000, 1), "K"),
ifelse(RelativeSpeed >= 10,
round(RelativeSpeed),
round(RelativeSpeed, 1)))), vjust = -0.3, size= 3) +
ylab(paste("Memory (Mb) usage per", tolower(titlesuffix))) + xlab("") +
theme_cowplot() +
theme(axis.text.x=element_text(angle=60, hjust=1), plot.margin=grid::unit(c(1,5,-6,0), "mm"), legend.position= c(0.2, 0.7), text = element_text(family = "Times")) +
coord_cartesian(clip = "off")
filename <- "peak_barplot"
filename <- ifelse(imputing, paste0(filename, "_imputed"), filename)
filename <- paste0(filename, "_", tolower(axis))
ggsave(file.path(paper_figures_folder, paste0(filename, ".png")), figure, bg = 'white')
if (!(imputing) & (axis == "Samples")) {
figure <- figure + theme(axis.text.x=element_text(angle=60, hjust=1, size=22), axis.text.y=element_text(size=22), axis.title.y=element_text(size=22), plot.margin=grid::unit(c(3,0,-7,0.5), "mm"))
figure$layers[[2]]$aes_params$size <- 7
figure$theme$legend.position <- "none"
ggsave(file.path(paper_figures_folder, paste0(filename, ".svg")), figure, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0(filename, ".pdf")), figure, width = 13, height = 5.9)
}
if (!(imputing) & (axis == "All")) {
avg_peak <- figure$data[c("Algorithm", "avg_peak")]
}
}
}
Average peak memory usage. Average peak memory usage (Mb) across datasets, divided by the number of elements (samples, features, cells) in each dataset, with the relative usage with respect to the most efficient algorithm shown on top.
Code
memory_matrix <- resources_df %>%
select(Dataset, Algorithm, Memory)
for (axis in c("All", "Samples", "Features")) {
for (imputing in c(TRUE, FALSE)) {
titlesuffix <- ifelse(axis=="All", "Cell", ifelse(axis=="Samples", "Sample", "Feature"))
figure <- memory_matrix %>%
group_by(Dataset) %>%
mutate(Memory = ifelse(imputing & is.na(Memory), max(Memory, na.rm = TRUE), Memory)) %>%
ungroup() %>%
left_join(select(dataset_table, Dataset, Samples, Features = "Total features", All= Cells), by = "Dataset") %>%
mutate(Memory = Memory / !!as.name(axis)) %>%
group_by(Algorithm) %>%
summarise(
avg_memory = mean(Memory, na.rm = TRUE),
) %>%
arrange(avg_memory) %>%
mutate(
Algorithm = factor(Algorithm, levels = unique(Algorithm)),
RelativeSpeed = avg_memory / min(avg_memory)
) %>%
left_join(algorithm_families, by = "Algorithm") %>%
ggplot(aes(x = reorder(Algorithm, avg_memory), y = avg_memory, fill=Strategy)) +
geom_bar(stat = "identity") +
geom_text(aes(label = ifelse(RelativeSpeed >= 1000,
paste0(round(RelativeSpeed / 1000, 1), "K"),
ifelse(RelativeSpeed >= 10,
round(RelativeSpeed),
round(RelativeSpeed, 1)))), vjust = -0.3, size= 3) +
ylab(paste("Memory (Mb) usage per", tolower(titlesuffix))) + xlab("") +
theme_cowplot() +
theme(axis.text.x=element_text(angle=60, hjust=1), plot.margin=grid::unit(c(1,5,-6,0), "mm"), legend.position= c(0.2, 0.8), text = element_text(family = "Times")) +
coord_cartesian(clip = "off")
filename <- "memory_barplot"
filename <- ifelse(imputing, paste0(filename, "_imputed"), filename)
filename <- paste0(filename, "_", tolower(axis))
ggsave(file.path(paper_figures_folder, paste0(filename, ".png")), figure, bg = 'white')
if (!(imputing) & (axis == "Samples")) {
figure <- figure + theme(axis.text.x=element_text(angle=60, hjust=1, size=21), axis.text.y=element_text(size=21), axis.title.y=element_text(size=22), plot.margin=grid::unit(c(3,0,-7,0.5), "mm"))
figure$layers[[2]]$aes_params$size <- 7
figure$theme$legend.position <- "none"
ggsave(file.path(paper_figures_folder, paste0(filename, ".svg")), figure, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0(filename, ".pdf")), figure, width = 13, height = 5.9)
}
if (!(imputing) & (axis == "All")) {
avg_memory <- figure$data[c("Algorithm", "avg_memory")]
}
}
}
Average memory usage. Average memory usage (Mb) across datasets, divided by the number of elements (samples, features, cells) in each dataset, with the relative usage with respect to the most efficient algorithm shown on top.
4.1.4.2 Individual
Code
figure <- time_matrix %>%
right_join(avg_time, by="Algorithm") %>%
arrange(desc(avg_time)) %>%
mutate(Algorithm = factor(Algorithm, levels = unique(Algorithm)),
Time = round(Time/60, 1)) %>%
ggplot(aes(x = Dataset, y = Algorithm, fill = Time, colour="")) +
geom_tile() +
geom_text(aes(label = Time), color = "black") +
scale_fill_gradient(low = "green", high = "red", limits = c(0, 60), oob = scales::squish, name = "Time (m)") +
scale_colour_manual(values=NA) +
guides(colour=guide_legend("Error", override.aes=list(colour="black"))) +
ylab("") + xlab("") +
scale_y_discrete(expand = c(0, 0)) +
theme_cowplot() +
theme(axis.text.x=element_text(angle=60, vjust= 0.5), text = element_text(family = "Times")) +
coord_cartesian(clip = "off")
filename <- "time_matrix"
ggsave(file.path(paper_figures_folder, paste0(filename, ".png")), figure, height = 6.5, width = 11, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0(filename, ".svg")), figure, height = 6.5, width = 11, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0(filename, ".pdf")), figure, height = 6.5, width = 11)Computing time. Heatmap of the individual computing times (in minutes).
Code
figure <- cpu_matrix %>%
right_join(avg_cpu, by="Algorithm") %>%
arrange(desc(avg_cpu)) %>%
mutate(Algorithm = factor(Algorithm, levels = unique(Algorithm)), CPU = round(CPU)) %>%
ggplot(aes(x = Dataset, y = Algorithm, fill = CPU, colour="")) +
geom_tile() +
geom_text(aes(label = CPU), color = "black") +
scale_fill_gradient(low = "green", high = "red", oob = scales::squish, name = "(%) CPU") +
scale_colour_manual(values=NA) +
guides(colour=guide_legend("Error", override.aes=list(colour="black"))) +
ylab("") + xlab("") +
scale_y_discrete(expand = c(0, 0)) +
theme_cowplot() +
theme(axis.text.x=element_text(angle=60, vjust= 0.5), text = element_text(family = "Times")) +
coord_cartesian(clip = "off")
filename <- "cpu_matrix"
ggsave(file.path(paper_figures_folder, paste0(filename, ".png")), figure, height = 6.5, width = 11, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0(filename, ".svg")), figure, height = 6.5, width = 11, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0(filename, ".pdf")), figure, height = 6.5, width = 11)CPU usage. Heatmap of the individual % CPU usage.
Code
figure <- peak_matrix %>%
right_join(avg_peak, by="Algorithm") %>%
arrange(desc(avg_peak)) %>%
mutate(Algorithm = factor(Algorithm, levels = unique(Algorithm)), Peak = round(Peak)) %>%
ggplot(aes(x = Dataset, y = Algorithm, fill = Peak, colour="")) +
geom_tile() +
geom_text(aes(label = round(Peak, 1)), color = "black") +
scale_fill_gradient(low = "green", high = "red", limits = c(0, 1000), oob = scales::squish, name = "Peak memory (Mb)") +
scale_colour_manual(values=NA) +
guides(colour=guide_legend("Error", override.aes=list(colour="black"))) +
ylab("") + xlab("") +
scale_y_discrete(expand = c(0, 0)) +
theme_cowplot() +
theme(axis.text.x=element_text(angle=60, vjust= 0.5), text = element_text(family = "Times")) +
coord_cartesian(clip = "off")
filename <- "peak_matrix"
ggsave(file.path(paper_figures_folder, paste0(filename, ".png")), figure, height = 6.5, width = 11, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0(filename, ".svg")), figure, height = 6.5, width = 11, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0(filename, ".pdf")), figure, height = 6.5, width = 11)Peak memory usage. Heatmap of the individual peak memory usage (Mb).
Code
figure <- memory_matrix %>%
right_join(avg_memory, by="Algorithm") %>%
arrange(desc(avg_memory)) %>%
mutate(Algorithm = factor(Algorithm, levels = unique(Algorithm)), Memory = round(Memory/1000, 1)) %>%
ggplot(aes(x = Dataset, y = Algorithm, fill = Memory, colour="")) +
geom_tile() +
geom_text(aes(label = round(Memory, 1)), color = "black") +
scale_fill_gradient(low = "green", high = "red", oob = scales::squish, name = "Memory (Gb)") +
scale_colour_manual(values=NA) +
guides(colour=guide_legend("Error", override.aes=list(colour="black"))) +
ylab("") + xlab("") +
scale_y_discrete(expand = c(0, 0)) +
theme_cowplot() +
theme(axis.text.x=element_text(angle=60, vjust= 0.5), text = element_text(family = "Times")) +
coord_cartesian(clip = "off")
filename <- "memory_matrix"
ggsave(file.path(paper_figures_folder, paste0(filename, ".png")), figure, height = 6.5, width = 11, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0(filename, ".svg")), figure, height = 6.5, width = 11, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0(filename, ".pdf")), figure, height = 6.5, width = 11)Memory usage. Heatmap of the individual memory usage (Gb).
Code
table_show <- time_matrix %>%
right_join(cpu_matrix, by= c("Dataset", "Algorithm")) %>%
right_join(memory_matrix, by= c("Dataset", "Algorithm")) %>%
right_join(peak_matrix, by= c("Dataset", "Algorithm")) %>%
mutate("Time (s)" = round(Time, digits= 2), "CPU (%)" = round(CPU, digits= 2), "Memory (Gb)" = round(Memory, digits= 2), "Peak memory (Mb)" = round(Peak, digits= 2)) %>%
arrange(Dataset, Algorithm) %>%
select(-c(Time, Memory, Peak, CPU))
reactable::reactable(table_show, pagination = TRUE, sortable = TRUE, defaultPageSize = 10)
All results. Table with all computing-resource measurements.
4.1.5 Algorithm comparison
Code
filtered_table <- alg_comparisons %>%
mutate_if(is.numeric, round, digits= 2) %>%
mutate(alg1 = factor(alg1)) %>%
mutate(alg2 = factor(alg2, levels = rev(levels(alg1)))) %>%
arrange(alg1, alg2) %>%
distinct(Comparison, .keep_all = TRUE)
metrics_comp <- names(which(sapply(filtered_table, is.numeric)))
for (metric in metrics_comp) {
figure <- filtered_table %>%
ggplot(aes(x = alg1, y = alg2, fill = get(metric))) +
geom_tile() +
geom_text(aes(label = get(metric)), color = "black") +
scale_fill_gradient(low = "white", high = "red", name = metric, limits=c(0, 1)) +
xlab("") + ylab("") +
theme_cowplot() +
theme(axis.text.x=element_text(angle=90, hjust=1, vjust= 0.5), legend.position = c(0.8,0.8), text = element_text(family = "Times")) +
coord_cartesian(clip = "off")
ggsave(file.path(paper_figures_folder, paste0("agreement_", metric, ".png")), figure, width = 12, height = 9, bg = 'white')
if (metric == "AMI") {
ggsave(file.path(paper_figures_folder, paste0("agreement_", metric, ".svg")), figure, width = 12, height = 9, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0("agreement_", metric, ".pdf")), figure, width = 12, height = 9)
}
}
Pairwise algorithm similarity: Similarity between clustering solutions across algorithms.
Code
for (metric in metrics_comp) {
filtered_table <- alg_comparisons %>%
arrange(alg1, alg2) %>%
dcast(alg1 ~ alg2, value.var = metric) %>%
as.matrix()
row.names(filtered_table) <- filtered_table[,1]
filtered_table <- filtered_table[,-1]
diag(filtered_table) <- 1
cnames = colnames(filtered_table)
rnames = rownames(filtered_table)
filtered_table = as.data.frame(filtered_table)
rownames(filtered_table) = rnames
colnames(filtered_table) = cnames
set.seed(0)
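# t-SNE on the dissimilarity matrix (1 - pairwise similarity); is_distance=TRUE
# skips recomputing distances, and perplexity is capped at floor((n - 1) / 3),
# the largest value Rtsne accepts.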
vis <- Rtsne(1-apply(as.matrix(filtered_table), 2, as.numeric), is_distance=TRUE, perplexity = floor((nrow(filtered_table) - 1) / 3), normalize=FALSE)
figure <- data.frame(TSNE1 = vis$Y[,1], TSNE2 = vis$Y[,2], Algorithm = rownames(filtered_table)) %>%
left_join(algorithm_families, by = "Algorithm") %>%
ggplot(aes(x = TSNE1, y = TSNE2, label = Algorithm, colour=Strategy)) +
geom_point(size=0) +
geom_text_repel(size =6, seed=0) +
theme_cowplot() +
theme(legend.position="none", text = element_text(family = "Times"))
ggsave(file.path(paper_figures_folder, paste0("network_", metric, ".png")), figure, bg = 'white')
if (metric == "AMI") {
figure <- figure +
theme(legend.position = c(0.575,0.15), legend.box.background = element_rect(color = "black"), legend.box.margin = margin(t = 3, r = 3, l=2)) +
guides(colour = guide_legend(nrow = 3, override.aes=aes(size=4, label="")))
ggsave(file.path(paper_figures_folder, paste0("network_", metric, ".png")), figure, bg = 'white')
figure <- figure +
theme(legend.text=element_text(size=16), axis.text.x = element_text(size = 16), axis.text.y = element_text(size = 16), axis.title.x = element_text(size = 20), axis.title.y = element_text(size = 20))
ggsave(file.path(paper_figures_folder, paste0("network_", metric, ".svg")), figure, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0("network_", metric, ".pdf")), figure)
}
}
Algorithm similarity: Similarity between clustering solutions across algorithms, measured by pairwise comparisons and visualized with t-SNE.
4.2 Evaluation on incomplete data
4.2.1 Comparison with ground-truth labels
Code
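# Duplicate the colour palette for the "imp_" (mean-imputation) variants of
# algorithms that cannot handle missing data natively.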
idx <- !(names(algorithm_colors) %in% incomplete_algorithms)
alg_colors2 <- algorithm_colors[idx]
names(alg_colors2) <- paste0("imp_", names(alg_colors2))
algorithm_colors <- c(algorithm_colors, alg_colors2)
basic_com_results <- com_results %>%
filter(Amputation_Mechanism != "Resampling") %>%
mutate(Methodology = ifelse(Imputation, "Mean imputation", "Only complete"))
basic_com_results <- basic_com_results %>%
rbind(
basic_com_results %>%
filter(Amputation_Mechanism == "No") %>%
mutate(Methodology = "Mean imputation")
) %>%
rbind(
basic_com_results %>%
filter(Amputation_Mechanism == "No") %>%
mutate(Methodology = "Whole dataset")
)
results_incomplete <- incom_results %>%
mutate("Real_Alg" = Algorithm, "Methodology" = "Whole dataset") %>%
rbind(
basic_com_results %>%
select(-Strategy) %>%
mutate("Real_Alg" = Algorithm) %>%
mutate(Algorithm = ifelse(Methodology == "Mean imputation", paste0("imp_", Algorithm), ifelse(Methodology == "Only complete", paste0("com_", Algorithm), Algorithm)))
) %>%
mutate("dataset_missing_pattern" = paste0(Dataset, Missing_Percentage, Amputation_Mechanism)) %>%
arrange(Dataset, Missing_Percentage, Algorithm, Amputation_Mechanism, Imputation)
4.2.1.1 Aggregated
Code
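# Rank all algorithms within each dataset x missing-percentage x mechanism
# condition, then summarise per algorithm how often it ranks first or in the
# top three, together with its average rank across conditions.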
metric <- metrics_performance[1]
results_summary_input <- results_incomplete %>%
filter(!((Real_Alg %in% incomplete_algorithms) & Imputation)) %>%
filter(Methodology != "Only complete", Missing_Percentage > 0)
results_summary <- results_summary_input %>%
select(dataset_missing_pattern, Dataset, Algorithm, metric, Real_Alg) %>%
group_by(dataset_missing_pattern) %>%
mutate(rank = rank(-!!as.name(metric))) %>%
group_by(Algorithm, Real_Alg) %>%
summarise(
"# in Top 3 Models" = sum(rank <= 3),
"# Best" = sum(rank == 1),
"Average Rank" = mean(rank),
"Sd Rank" = sd(rank),
"# Datasets Best" = n_distinct(Dataset[rank == 1]),
"# Datasets Top 3" = n_distinct(Dataset[rank <= 3]),
) %>%
arrange(-`# in Top 3 Models`, -`# Best`, `Average Rank`) %>%
mutate_if(is.numeric, round, digits= 1) %>%
left_join(results_summary_input %>% filter((!(Real_Alg %in% incomplete_algorithms) & (Imputation == TRUE)) | ((Algorithm %in% incomplete_algorithms) & (Imputation == FALSE))) %>% count(Algorithm, name = "# Conditions"), by = "Algorithm") %>%
select(-c("Real_Alg"))
minimum_conditions <- 500
minimum_conditions <- filter(results_summary, `# Conditions` >= minimum_conditions)$Algorithm
reactable::reactable(results_summary, pagination = TRUE, sortable = TRUE, defaultPageSize = 10)
Code
# print(xtable(results_summary[1:10,] %>% mutate("Average Rank" = paste0(round(`Average Rank`, 1), "\u00B1", round(`Sd Rank`, 1)), "# Best" = paste0(`# Best`, "(", `# Datasets Best`, ")"), "# in Top 3 Models" = paste0(`# in Top 3 Models`, "(", `# Datasets Top 3`, ")"), "Algorithm" = gsub("imp_", "(imp)", Algorithm)) %>% select(-c(`Sd Rank`, "# Datasets Best", "# Datasets Top 3"))), include.rownames=FALSE)
External validation summary. Summary of performance, showing the number of times each method was among the top three methods (with the number of distinct datasets indicated in parentheses), the number of times it achieved the best score (also with dataset count), the average rank (mean\(\pm\)standard deviation), and the total number of conditions in which it was evaluated.
Code
interesting_algs <- c("NEMO", "IMSR", "PIMVC")
for (metric in metrics_performance) {
baseline_values <- results_incomplete %>%
filter(Amputation_Mechanism == "No", Methodology == "Only complete") %>%
group_by(Dataset) %>%
summarise("{metric}" := max(!!as.name(metric))) %>%
select(Dataset, Baseline = metric)
for (ampt_mech in amputation_mechanisms) {
figure <- results_incomplete %>%
filter(Algorithm %in% minimum_conditions) %>%
filter(!((Real_Alg %in% incomplete_algorithms) & Imputation), Methodology != "Only complete") %>%
filter(Methodology != "Only complete") %>%
filter(!((Missing_Percentage == 0) & (Methodology == "Whole dataset") & (Real_Alg %in% complete_algorithms))) %>%
filter(!((Missing_Percentage == 0) & (Methodology == "Mean imputation") & (Real_Alg %in% incomplete_algorithms))) %>%
filter(Amputation_Mechanism == "No" | ampt_mech == "All" | Amputation_Mechanism == ampt_mech) %>%
select(Dataset, Algorithm, Missing_Percentage, Amputation_Mechanism, Real_Alg, metric) %>%
left_join(baseline_values, by = "Dataset") %>%
mutate("{metric}" := pmax(0, pmin(!!as.name(metric) / Baseline, 1))) %>%
group_by(Algorithm, Missing_Percentage) %>%
summarise("{metric}" := mean(!!as.name(metric))) %>%
filter(Missing_Percentage > 0) %>%
ggplot(aes(x=Missing_Percentage, y=get(metric), group=Algorithm, colour=Algorithm)) +
geom_line(linewidth = 1.2) +
ylim(0, 1) +
scale_x_continuous(breaks=c(10, 25, 50, 75, 90)) +
xlab("Incomplete samples (%)") + ylab(paste("Normalized", ifelse(metric == "MCC_mean", "Matthews correlation coefficient", ifelse(metric == "AMI_mean", "adjusted mutual information", "adjusted rand index")))) +
theme_cowplot(font_family = "Times") +
# scale_color_manual(values = algorithm_colors) +
gghighlight(Algorithm %in% interesting_algs, use_direct_label = TRUE, keep_scales = TRUE, unhighlighted_params = list(linewidth = .6))
ggsave(file.path(paper_figures_folder, paste0("inc_lines_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".png")), figure, bg = 'white')
if (metric == metrics_performance[1]) {
figure$layers[[4]]$aes_params$size <- 7
figure <- figure + ylab("Matthews correlation coefficient")
figure <- figure + theme(text = element_text(size=22), axis.text.x = element_text(size=18), axis.text.y = element_text(size=18))
ggsave(file.path(paper_figures_folder, paste0("inc_lines_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".svg")), figure + theme(element_text(size=18)), bg = 'white')
ggsave(file.path(paper_figures_folder, paste0("inc_lines_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".pdf")), figure + theme(element_text(size=18)))
}
}
}
Aggregated performance. Average performance of each method across datasets for varying levels of incomplete samples and different missingness patterns. To ensure consistent aggregation across datasets, all scores were normalized to a common scale prior to averaging.
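The normalization referred to above can be summarised as a small helper (a sketch distilled from the pipeline code; `normalize_score` is a hypothetical name, not a function defined elsewhere in this notebook): each score is divided by the dataset's baseline, i.e. the best score obtained on the fully observed complete cases, and then clipped to the [0, 1] interval.
Code
# Sketch of the per-dataset normalization applied before averaging across
# datasets: divide by the baseline (best complete-case score) and clip to [0, 1].
normalize_score <- function(score, baseline) {
  pmax(0, pmin(score / baseline, 1))
}
normalize_score(c(0.42, 0.80, -0.10), baseline = 0.80)  # 0.525 1.000 0.000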
Code
for (metric in metrics_performance) {
baseline_values <- results_incomplete %>%
filter(Amputation_Mechanism == "No", Methodology == "Only complete") %>%
group_by(Dataset) %>%
summarise("{metric}" := max(!!as.name(metric))) %>%
select(Dataset, Baseline = metric)
for (ampt_mech in amputation_mechanisms) {
figure <- results_incomplete %>%
mutate(Methodology = factor(Methodology, levels=c("Whole dataset", "Mean imputation", "Only complete"))) %>%
filter(Real_Alg %in% interesting_algs) %>%
filter(Amputation_Mechanism == "No" | ampt_mech == "All" | Amputation_Mechanism == ampt_mech) %>%
select(Dataset, Algorithm, Missing_Percentage, Amputation_Mechanism, Methodology, Real_Alg, metric) %>%
left_join(baseline_values, by = "Dataset") %>%
mutate("{metric}" := pmax(0, pmin(!!as.name(metric) / Baseline, 1))) %>%
group_by(Algorithm, Missing_Percentage, Methodology, Real_Alg) %>%
summarise("{metric}" := mean(!!as.name(metric))) %>%
arrange(Missing_Percentage, Algorithm, Methodology) %>%
filter(Missing_Percentage > 0) %>%
ggplot(aes(x=Missing_Percentage, y=get(metric), group = interaction(Real_Alg, Methodology), colour=Real_Alg, linetype = Methodology)) +
geom_line(linewidth = 1.2) +
ylim(0, 1) +
scale_linetype_manual(values=c("solid", "dotted", "twodash")) +
scale_x_continuous(breaks=c(10, 25, 50, 75, 90)) +
xlab("Incomplete samples (%)") + ylab(paste("Normalized", ifelse(metric == "MCC_mean", "Matthews correlation coefficient", ifelse(metric == "AMI_mean", "adjusted mutual information", "adjusted rand index")))) +
theme_cowplot() +
# scale_color_manual(values = algorithm_colors) +
theme(legend.position = c(0.05, 0.13), legend.box = "horizontal", text = element_text(family = "Times")) +
guides(colour= guide_legend(title="Algorithm"))
ggsave(file.path(paper_figures_folder, paste0("inc_basic_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".png")), figure, bg = 'white')
if ((metric == metrics_performance[1]) & (ampt_mech == "All")) {
ggsave(file.path(paper_figures_folder, paste0("inc_basic_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".svg")), figure, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0("inc_basic_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".pdf")), figure)
}
}
}
Performance vs common simple approaches. Average performance across datasets of the top methods able to handle incomplete multi-modal data, compared with mean imputation and with excluding incomplete samples, for varying levels of incomplete samples and different missingness patterns. To ensure consistent aggregation across datasets, all scores were normalized to a common scale prior to averaging.
4.2.1.2 Individual
Code
for (metric in metrics_performance) {
baseline_values <- results_incomplete %>%
filter(Amputation_Mechanism == "No", Methodology == "Only complete") %>%
group_by(Dataset) %>%
summarise("{metric}" := max(!!as.name(metric))) %>%
select(Dataset, Baseline = metric)
for (ampt_mech in amputation_mechanisms) {
figure <- results_incomplete %>%
filter(Algorithm %in% minimum_conditions) %>%
filter(!((Real_Alg %in% incomplete_algorithms) & Imputation), Methodology != "Only complete") %>%
filter(Methodology != "Only complete") %>%
filter(!((Missing_Percentage == 0) & (Methodology == "Whole dataset") & (Real_Alg %in% complete_algorithms))) %>%
filter(!((Missing_Percentage == 0) & (Methodology == "Mean imputation") & (Real_Alg %in% incomplete_algorithms))) %>%
filter(Amputation_Mechanism == "No" | ampt_mech == "All" | Amputation_Mechanism == ampt_mech) %>%
select(Dataset, Algorithm, Missing_Percentage, Amputation_Mechanism, Real_Alg, metric) %>%
group_by(Dataset, Algorithm, Missing_Percentage) %>%
summarise("{metric}" := mean(!!as.name(metric))) %>%
filter(Missing_Percentage > 0) %>%
ggplot(aes(x=Missing_Percentage, y=get(metric), group=Algorithm, colour=Algorithm)) +
facet_wrap(~ Dataset) +
geom_line(linewidth = 1.2) +
ylim(0, 1) +
scale_x_continuous(breaks=c(10, 25, 50, 75, 90)) +
xlab("Incomplete samples (%)") + ylab(ifelse(metric == "MCC_mean", "Matthews correlation coefficient", ifelse(metric == "AMI_mean", "Adjusted mutual information", "Adjusted rand index"))) +
theme_cowplot(font_family = "Times") +
# scale_color_manual(values = algorithm_colors) +
gghighlight(Algorithm %in% interesting_algs, use_direct_label = TRUE, keep_scales = TRUE, unhighlighted_params = list(linewidth = .6), calculate_per_facet = TRUE)
ggsave(file.path(paper_figures_folder, paste0("inc_indlines_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".png")), figure, width=12, height = 10, bg = 'white')
if ((metric == metrics_performance[1]) & (ampt_mech == "All")) {
ggsave(file.path(paper_figures_folder, paste0("inc_indlines_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".svg")), figure, width=12, height = 10, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0("inc_indlines_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".pdf")), figure, width=12, height = 10)
}
}
}
Individual performance. Performance of each method on each dataset for varying levels of incomplete samples and different missingness patterns.
Code
for (metric in metrics_performance) {
baseline_values <- results_incomplete %>%
filter(Amputation_Mechanism == "No", Methodology == "Only complete") %>%
group_by(Dataset) %>%
summarise("{metric}" := max(!!as.name(metric))) %>%
select(Dataset, Baseline = metric)
for (ampt_mech in amputation_mechanisms) {
figure <- results_incomplete %>%
mutate(Methodology = factor(Methodology, levels=c("Whole dataset", "Mean imputation", "Only complete"))) %>%
filter(Real_Alg %in% interesting_algs) %>%
filter(Amputation_Mechanism == "No" | ampt_mech == "All" | Amputation_Mechanism == ampt_mech) %>%
select(Dataset, Algorithm, Missing_Percentage, Amputation_Mechanism, Methodology, Real_Alg, "mean" = metric, "sd"= paste0(gsub(extraw_metrics_performance, "", metric), "_std")) %>%
group_by(Dataset, Algorithm, Missing_Percentage, Methodology, Real_Alg) %>%
summarise("sd" = mean(sd), "mean" := mean(mean)) %>%
arrange(Dataset, Missing_Percentage, Algorithm, Methodology) %>%
filter(Missing_Percentage > 0) %>%
ggplot(aes(x=Missing_Percentage, y=mean, group = interaction(Real_Alg, Methodology), colour=Real_Alg, linetype = Methodology)) +
facet_wrap(~ Dataset) +
geom_line(linewidth = 1.2) +
ylim(0, 1) +
scale_linetype_manual(values=c("solid", "dotted", "twodash")) +
scale_x_continuous(breaks=c(10, 25, 50, 75, 90)) +
xlab("Incomplete samples (%)") + ylab(ifelse(metric == "MCC_mean", "Matthews correlation coefficient", ifelse(metric == "AMI_mean", "Adjusted mutual information", "Adjusted rand index"))) +
theme_cowplot(font_family = "Times") +
# scale_color_manual(values = algorithm_colors) +
guides(colour= guide_legend(title="Algorithm"))
ggsave(file.path(paper_figures_folder, paste0("inc_indbasic_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".png")), figure, width=12, height = 10, bg = 'white')
if ((metric == metrics_performance[1]) & (ampt_mech == "All")) {
ggsave(file.path(paper_figures_folder, paste0("inc_indbasic_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".svg")), figure, width=12, height = 10, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0("inc_indbasic_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".pdf")), figure, width=12, height = 10)
}
}
}
Individual performance vs common simple approaches. Performance of the top methods able to handle incomplete multi-modal data, compared with mean imputation and with excluding incomplete samples, for varying levels of incomplete samples and different missingness patterns.
Code
interesting_algs <- "IMSR"
interesting_datasets <- "BBCSport"
for (metric in metrics_performance) {
for (ampt_mech in amputation_mechanisms) {
figure <- results_incomplete %>%
mutate(Methodology = factor(Methodology, levels=c("Whole dataset", "Mean imputation", "Only complete"))) %>%
filter(Real_Alg %in% interesting_algs) %>%
filter(Amputation_Mechanism == "No" | ampt_mech == "All" | Amputation_Mechanism == ampt_mech) %>%
select(Dataset, Algorithm, Missing_Percentage, Amputation_Mechanism, Methodology, Real_Alg, "mean" = metric, "sd"= paste0(gsub(extraw_metrics_performance, "", metric), "_std")) %>%
filter((Dataset == interesting_datasets) & (Real_Alg == interesting_algs)) %>%
group_by(Dataset, Algorithm, Missing_Percentage, Methodology, Real_Alg) %>%
summarise("sd" = mean(sd), "mean" := mean(mean)) %>%
filter(Missing_Percentage > 0) %>%
ggplot(aes(x=Missing_Percentage, y=mean, group = Methodology, colour=Methodology)) +
geom_line() +
ylim(0, 1) +
geom_ribbon(aes(ymax = mean + sd, ymin= mean - sd, fill= Methodology, x=Missing_Percentage), alpha = 0.3) +
scale_x_continuous(breaks=c(10, 25, 50, 75, 90)) +
xlab("Incomplete samples (%)") + ylab(ifelse(metric == "MCC_mean", "Matthews correlation coefficient", ifelse(metric == "AMI_mean", "Adjusted mutual information", "Adjusted rand index"))) +
theme_cowplot(font_family = "Times") +
theme(legend.position = c(0.1, 0.2), legend.text = element_text(size=18), legend.title = element_blank())
ggsave(file.path(paper_figures_folder, paste0("conf_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".png")), figure, bg = 'white')
if ((metric == metrics_performance[1]) & (ampt_mech == "PM")) {
figure <- figure + theme(text = element_text(size=22), axis.text.x = element_text(size=18), axis.text.y = element_text(size=18))
ggsave(file.path(paper_figures_folder, paste0("conf_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".svg")), figure, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0("conf_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".pdf")), figure)
}
}
}
Confidence interval. Confidence interval (mean\(\pm\)standard deviation) for IMSR on the BBCSport dataset, compared with mean imputation and with excluding incomplete samples, for varying levels of incomplete samples and different missingness patterns.
Code
table_show <- results_incomplete %>%
filter(Missing_Percentage > 0) %>%
group_by(dataset_missing_pattern) %>%
mutate(across(all_of(metrics_performance), ~ rank(-.), .names ="{.col}_rank")) %>%
ungroup() %>%
rename_with(~ str_remove(., extraw_metrics_performance), ends_with("_rank")) %>%
{
metric_cols <- grep(paste0("^(", paste(gsub(extraw_metrics_performance, "", metrics_performance), collapse = "|"), ")_(mean|std|rank)$"),
colnames(.), value = TRUE)
mean_cols <- grep("_mean$", metric_cols, value = TRUE)
std_cols <- grep("_std$", metric_cols, value = TRUE)
rank_cols <- grep("_rank$", metric_cols, value = TRUE)
ordered_cols <- c(rbind(mean_cols, std_cols, rank_cols))
ordered_cols <- ordered_cols[!is.na(ordered_cols)]
select(., Dataset, Algorithm, Missing_Percentage, Amputation_Mechanism, all_of(ordered_cols))
} %>%
arrange(Dataset, Missing_Percentage, Algorithm, Amputation_Mechanism) %>%
filter(Missing_Percentage > 0) %>%
mutate(across(where(is.numeric), \(x) round(x, digits = 2)))
reactable::reactable(table_show, pagination = TRUE, sortable = TRUE, defaultPageSize = 10)
All results. Table with all results, showing mean, standard deviation and rank for each metric.
4.2.2 Stability
Code
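# Assemble the stability results analogously to the external-validation tables:
# complete-data algorithms appear as "imp_" (mean imputation) or "com_"
# (complete cases only) variants alongside the native incomplete-data methods.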
basic_com_results <- uns_metrics_by_com_alg %>%
filter(Amputation_Mechanism != "Resampling") %>%
mutate(Methodology = ifelse(Imputation, "Mean imputation", "Only complete"))
basic_com_results <- basic_com_results %>%
rbind(
basic_com_results %>%
filter(Amputation_Mechanism == "No") %>%
mutate(Methodology = "Mean imputation")
) %>%
rbind(
basic_com_results %>%
filter(Amputation_Mechanism == "No") %>%
mutate(Methodology = "Whole dataset")
)
stab_incomplete <- uns_metrics_by_incom_alg %>%
mutate("Real_Alg" = Algorithm, "Methodology" = "Whole dataset") %>%
rbind(
basic_com_results %>%
select(-Strategy) %>%
mutate("Real_Alg" = Algorithm) %>%
mutate(Algorithm = ifelse(Methodology == "Mean imputation", paste0("imp_", Algorithm), ifelse(Methodology == "Only complete", paste0("com_", Algorithm), Algorithm)))
) %>%
mutate("dataset_missing_pattern" = paste0(Dataset, Missing_Percentage, Amputation_Mechanism)) %>%
arrange(Dataset, Missing_Percentage, Algorithm, Amputation_Mechanism, Imputation)
4.2.2.1 Aggregated
Code
metric <- metrics_stab[1]
results_summary_input <- stab_incomplete %>%
filter(!((Real_Alg %in% incomplete_algorithms) & Imputation)) %>%
filter(Methodology != "Only complete", Missing_Percentage > 0)
results_summary <- results_summary_input %>%
select(dataset_missing_pattern, Dataset, Algorithm, metric, Real_Alg) %>%
group_by(dataset_missing_pattern) %>%
mutate(rank = rank(-!!as.name(metric))) %>%
group_by(Algorithm, Real_Alg) %>%
summarise(
"# in Top 3 Models" = sum(rank <= 3),
"# Best" = sum(rank == 1),
"Average Rank" = mean(rank),
"Sd Rank" = sd(rank),
"# Datasets Best" = n_distinct(Dataset[rank == 1]),
"# Datasets Top 3" = n_distinct(Dataset[rank <= 3]),
) %>%
arrange(-`# in Top 3 Models`, -`# Best`, `Average Rank`) %>%
mutate_if(is.numeric, round, digits= 1) %>%
left_join(results_summary_input %>% filter((!(Real_Alg %in% incomplete_algorithms) & (Imputation == TRUE)) | ((Algorithm %in% incomplete_algorithms) & (Imputation == FALSE))) %>% count(Algorithm, name = "# Conditions"), by = "Algorithm") %>%
select(-c("Real_Alg"))
minimum_conditions <- 500
minimum_conditions <- filter(results_summary, `# Conditions` >= minimum_conditions)$Algorithm
reactable::reactable(results_summary, pagination = TRUE, sortable = TRUE, defaultPageSize = 10)
Code
# print(xtable(results_summary[1:10,] %>% mutate("Average Rank" = paste0(round(`Average Rank`, 1), "\u00B1", round(`Sd Rank`, 1)), "# Best" = paste0(`# Best`, "(", `# Datasets Best`, ")"), "# in Top 3 Models" = paste0(`# in Top 3 Models`, "(", `# Datasets Top 3`, ")"), "Algorithm" = gsub("imp_", "(imp)", Algorithm)) %>% select(-c(`Sd Rank`, "# Datasets Best", "# Datasets Top 3"))), include.rownames=FALSE)
Cluster stability summary. Summary of performance, showing the number of times each method was among the top three methods (with the number of distinct datasets indicated in parentheses), the number of times it achieved the best score (also with dataset count), the average rank (mean\(\pm\)standard deviation), and the total number of conditions in which it was evaluated.
Code
interesting_algs <- c("NEMO", "IMSR", "PIMVC")
for (metric in metrics_stab) {
baseline_values <- stab_incomplete %>%
filter(Amputation_Mechanism == "No", Methodology == "Only complete") %>%
group_by(Dataset) %>%
summarise("{metric}" := max(!!as.name(metric))) %>%
select(Dataset, Baseline = metric)
for (ampt_mech in amputation_mechanisms) {
figure <- stab_incomplete %>%
filter(Algorithm %in% minimum_conditions) %>%
filter(!((Real_Alg %in% incomplete_algorithms) & Imputation), Methodology != "Only complete") %>%
filter(Methodology != "Only complete") %>%
filter(!((Missing_Percentage == 0) & (Methodology == "Whole dataset") & (Real_Alg %in% complete_algorithms))) %>%
filter(!((Missing_Percentage == 0) & (Methodology == "Mean imputation") & (Real_Alg %in% incomplete_algorithms))) %>%
filter(Amputation_Mechanism == "No" | ampt_mech == "All" | Amputation_Mechanism == ampt_mech) %>%
select(Dataset, Algorithm, Missing_Percentage, Amputation_Mechanism, Real_Alg, metric) %>%
left_join(baseline_values, by = "Dataset") %>%
mutate("{metric}" := pmax(0, pmin(!!as.name(metric) / Baseline, 1))) %>%
group_by(Algorithm, Missing_Percentage) %>%
summarise("{metric}" := mean(!!as.name(metric))) %>%
filter(Missing_Percentage > 0) %>%
ggplot(aes(x=Missing_Percentage, y=get(metric), group=Algorithm, colour=Algorithm)) +
geom_line(linewidth = 1.2) +
ylim(0, 1) +
scale_x_continuous(breaks=c(10, 25, 50, 75, 90)) +
xlab("Incomplete samples (%)") + ylab(paste("Normalized", ifelse(metric == "AMI_Stab", "adjusted mutual information", "adjusted rand index"))) +
theme_cowplot(font_family = "Times") +
# scale_color_manual(values = algorithm_colors) +
gghighlight(Algorithm %in% interesting_algs, use_direct_label = TRUE, keep_scales = TRUE, unhighlighted_params = list(linewidth = .6))
ggsave(file.path(paper_figures_folder, paste0("inc_lines_stab_", ampt_mech, "_", gsub(extraw_metrics_stab, "", metric),".png")), figure, bg = 'white')
if ((metric == metrics_stab[1]) & (ampt_mech == "All")) {
figure$layers[[4]]$aes_params$size <- 7
figure <- figure + ylab("Adjusted mutual information")
figure <- figure + theme(text = element_text(size=22), axis.text.x = element_text(size=18), axis.text.y = element_text(size=18))
ggsave(file.path(paper_figures_folder, paste0("inc_lines_stab_", ampt_mech, "_", gsub(extraw_metrics_stab, "", metric),".svg")), figure, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0("inc_lines_stab_", ampt_mech, "_", gsub(extraw_metrics_stab, "", metric),".pdf")), figure)
}
}
}
Aggregated performance. Average performance of each method across datasets for varying levels of incomplete samples and different missingness patterns. To ensure consistent aggregation across datasets, all scores were normalized to a common scale prior to averaging.
Code
for (metric in metrics_stab) {
baseline_values <- stab_incomplete %>%
filter(Amputation_Mechanism == "No", Methodology == "Only complete") %>%
group_by(Dataset) %>%
summarise("{metric}" := max(!!as.name(metric))) %>%
select(Dataset, Baseline = metric)
for (ampt_mech in amputation_mechanisms) {
figure <- stab_incomplete %>%
mutate(Methodology = factor(Methodology, levels=c("Whole dataset", "Mean imputation", "Only complete"))) %>%
filter(Real_Alg %in% interesting_algs) %>%
filter(Amputation_Mechanism == "No" | ampt_mech == "All" | Amputation_Mechanism == ampt_mech) %>%
select(Dataset, Algorithm, Missing_Percentage, Amputation_Mechanism, Methodology, Real_Alg, metric) %>%
left_join(baseline_values, by = "Dataset") %>%
mutate("{metric}" := pmax(0, pmin(!!as.name(metric) / Baseline, 1))) %>%
group_by(Algorithm, Missing_Percentage, Methodology, Real_Alg) %>%
summarise("{metric}" := mean(!!as.name(metric))) %>%
arrange(Missing_Percentage, Algorithm, Methodology) %>%
filter(Missing_Percentage > 0) %>%
ggplot(aes(x=Missing_Percentage, y=get(metric), group = interaction(Real_Alg, Methodology), colour=Real_Alg, linetype = Methodology)) +
geom_line(linewidth = 1.2) +
ylim(0, 1) +
scale_linetype_manual(values=c("solid", "dotted", "twodash")) +
scale_x_continuous(breaks=c(10, 25, 50, 75, 90)) +
xlab("Incomplete samples (%)") + ylab(paste("Normalized", ifelse(metric == "AMI_Stab", "adjusted mutual information", "adjusted rand index"))) +
theme_cowplot() +
# scale_color_manual(values = algorithm_colors) +
theme(legend.position = c(0.05, 0.13), legend.box = "horizontal", text = element_text(family = "Times")) +
guides(colour= guide_legend(title="Algorithm"))
ggsave(file.path(paper_figures_folder, paste0("inc_basic_stab_", ampt_mech, "_", gsub(extraw_metrics_stab, "", metric),".png")), figure, bg = 'white')
if ((metric == metrics_stab[1]) & (ampt_mech == "All")) {
ggsave(file.path(paper_figures_folder, paste0("inc_basic_stab_", ampt_mech, "_", gsub(extraw_metrics_stab, "", metric),".svg")), figure, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0("inc_basic_stab_", ampt_mech, "_", gsub(extraw_metrics_stab, "", metric),".pdf")), figure)
}
}
}
Comparison with common simple approaches. Average performance across datasets of the top methods able to handle incomplete multi-modal data, compared with mean imputation and with excluding incomplete samples, for varying levels of incomplete samples and different missingness patterns. To ensure consistent aggregation across datasets, all scores were normalized to a common scale prior to averaging.
4.2.2.2 Individual
Code
for (metric in metrics_stab) {
baseline_values <- stab_incomplete %>%
filter(Amputation_Mechanism == "No", Methodology == "Only complete") %>%
group_by(Dataset) %>%
summarise("{metric}" := max(!!as.name(metric))) %>%
select(Dataset, Baseline = metric)
for (ampt_mech in amputation_mechanisms) {
figure <- stab_incomplete %>%
filter(Algorithm %in% minimum_conditions) %>%
filter(!((Real_Alg %in% incomplete_algorithms) & Imputation), Methodology != "Only complete") %>%
filter(Methodology != "Only complete") %>%
filter(!((Missing_Percentage == 0) & (Methodology == "Whole dataset") & (Real_Alg %in% complete_algorithms))) %>%
filter(!((Missing_Percentage == 0) & (Methodology == "Mean imputation") & (Real_Alg %in% incomplete_algorithms))) %>%
filter(Amputation_Mechanism == "No" | ampt_mech == "All" | Amputation_Mechanism == ampt_mech) %>%
select(Dataset, Algorithm, Missing_Percentage, Amputation_Mechanism, Real_Alg, metric) %>%
group_by(Dataset, Algorithm, Missing_Percentage) %>%
summarise("{metric}" := mean(!!as.name(metric))) %>%
filter(Missing_Percentage > 0) %>%
ggplot(aes(x=Missing_Percentage, y=get(metric), group=Algorithm, colour=Algorithm)) +
facet_wrap(~ Dataset) +
geom_line(linewidth = 1.2) +
ylim(0, 1) +
scale_x_continuous(breaks=c(10, 25, 50, 75, 90)) +
xlab("Incomplete samples (%)") + ylab(ifelse(metric == "AMI_Stab", "Adjusted mutual information", "Adjusted rand index")) +
theme_cowplot(font_family = "Times") +
# scale_color_manual(values = algorithm_colors) +
gghighlight(Algorithm %in% interesting_algs, use_direct_label = TRUE, keep_scales = TRUE, unhighlighted_params = list(linewidth = .6), calculate_per_facet = TRUE)
ggsave(file.path(paper_figures_folder, paste0("inc_lines_indstab_", ampt_mech, "_", gsub(extraw_metrics_stab, "", metric),".png")), figure, width=12, height = 10, bg = 'white')
if ((metric == metrics_stab[1]) & (ampt_mech == "All")) {
ggsave(file.path(paper_figures_folder, paste0("inc_lines_indstab_", ampt_mech, "_", gsub(extraw_metrics_stab, "", metric),".svg")), figure, width=12, height = 10, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0("inc_lines_indstab_", ampt_mech, "_", gsub(extraw_metrics_stab, "", metric),".pdf")), figure, width=12, height = 10)
}
}
}
Individual performance. Performance of each method on each dataset for varying levels of incomplete samples and different missingness patterns.
Code
for (metric in metrics_stab) {
baseline_values <- stab_incomplete %>%
filter(Amputation_Mechanism == "No", Methodology == "Only complete") %>%
group_by(Dataset) %>%
summarise("{metric}" := max(!!as.name(metric))) %>%
select(Dataset, Baseline = metric)
for (ampt_mech in amputation_mechanisms) {
figure <- stab_incomplete %>%
mutate(Methodology = factor(Methodology, levels=c("Whole dataset", "Mean imputation", "Only complete"))) %>%
filter(Real_Alg %in% interesting_algs) %>%
filter(Amputation_Mechanism == "No" | ampt_mech == "All" | Amputation_Mechanism == ampt_mech) %>%
select(Dataset, Algorithm, Missing_Percentage, Amputation_Mechanism, Methodology, Real_Alg, "mean" = metric) %>%
group_by(Dataset, Algorithm, Missing_Percentage, Methodology, Real_Alg) %>%
summarise("mean" := mean(mean)) %>%
arrange(Dataset, Missing_Percentage, Algorithm, Methodology) %>%
filter(Missing_Percentage > 0) %>%
ggplot(aes(x=Missing_Percentage, y=mean, group = interaction(Real_Alg, Methodology), colour=Real_Alg, linetype = Methodology)) +
facet_wrap(~ Dataset) +
geom_line(linewidth = 1.2) +
ylim(0, 1) +
scale_linetype_manual(values=c("solid", "dotted", "twodash")) +
scale_x_continuous(breaks=c(10, 25, 50, 75, 90)) +
xlab("Incomplete samples (%)") + ylab(ifelse(metric == "AMI_Stab", "Adjusted mutual information", "Adjusted rand index")) +
theme_cowplot(font_family = "Times") +
# scale_color_manual(values = algorithm_colors) +
guides(colour= guide_legend(title="Algorithm"))
ggsave(file.path(paper_figures_folder, paste0("inc_indbasic_stab_", ampt_mech, "_", gsub(extraw_metrics_stab, "", metric),".png")), figure, width=12, height = 10, bg = 'white')
if ((metric == metrics_stab[1]) & (ampt_mech == "All")) {
ggsave(file.path(paper_figures_folder, paste0("inc_indbasic_stab_", ampt_mech, "_", gsub(extraw_metrics_stab, "", metric),".svg")), figure, width=12, height = 10, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0("inc_indbasic_stab_", ampt_mech, "_", gsub(extraw_metrics_stab, "", metric),".pdf")), figure, width=12, height = 10)
}
}
}
Comparison with common simple approaches. Performance of the top methods able to handle incomplete multi-modal data, compared with mean imputation and with excluding incomplete samples, for varying levels of incomplete samples and different missingness patterns.
Code
interesting_algs <- "IMSR"
interesting_datasets <- "BBCSport"
for (metric in metrics_stab) {
for (ampt_mech in amputation_mechanisms) {
figure <- stab_incomplete %>%
mutate(Methodology = factor(Methodology, levels=c("Whole dataset", "Mean imputation", "Only complete"))) %>%
filter(Real_Alg %in% interesting_algs) %>%
filter(Amputation_Mechanism == "No" | ampt_mech == "All" | Amputation_Mechanism == ampt_mech) %>%
filter((Dataset == interesting_datasets) & (Real_Alg == interesting_algs)) %>%
group_by(Dataset, Algorithm, Missing_Percentage, Methodology, Real_Alg) %>%
summarise("mean" := mean(!!sym(metric))) %>%
filter(Missing_Percentage > 0) %>%
ggplot(aes(x=Missing_Percentage, y=mean, group = Methodology, colour=Methodology)) +
geom_line() +
ylim(0, 1) +
scale_x_continuous(breaks=c(10, 25, 50, 75, 90)) +
xlab("Incomplete samples (%)") + ylab(ifelse(metric == "AMI_Stab", "Adjusted mutual information", "Adjusted rand index")) +
theme_cowplot(font_family = "Times") +
theme(legend.position = c(0.1, 0.2), legend.text = element_text(size=18), legend.title = element_blank())
ggsave(file.path(paper_figures_folder, paste0("conf_stab_", ampt_mech, "_", gsub(extraw_metrics_stab, "", metric),".png")), figure, bg = 'white')
if ((metric == metrics_stab[1]) & (ampt_mech == "PM")) {
figure <- figure + theme(text = element_text(size=22), axis.text.x = element_text(size=18), axis.text.y = element_text(size=18))
ggsave(file.path(paper_figures_folder, paste0("conf_stab_", ampt_mech, "_", gsub(extraw_metrics_stab, "", metric),".svg")), figure, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0("conf_stab_", ampt_mech, "_", gsub(extraw_metrics_stab, "", metric),".pdf")), figure)
}
}
}
Stability comparison. Mean stability for IMSR on the BBCSport dataset, compared with mean imputation and with excluding incomplete samples, for varying levels of incomplete samples and different missingness patterns.
Code
table_show <- stab_incomplete %>%
filter(Missing_Percentage > 0) %>%
group_by(dataset_missing_pattern) %>%
mutate(across(all_of(metrics_stab), ~ rank(-.), .names ="{.col}_rank")) %>%
ungroup() %>%
rename_with(~ str_remove(., extraw_metrics_stab), ends_with("_rank")) %>%
{
metric_cols <- grep(paste0("^(", paste(gsub(extraw_metrics_stab, "", metrics_stab), collapse = "|"), ")_(Stab|rank)$"),
colnames(.), value = TRUE)
mean_cols <- grep("_Stab$", metric_cols, value = TRUE)
rank_cols <- grep("_rank$", metric_cols, value = TRUE)
ordered_cols <- c(rbind(mean_cols, rank_cols))
ordered_cols <- ordered_cols[!is.na(ordered_cols)]
select(., Dataset, Algorithm, Missing_Percentage, Amputation_Mechanism, all_of(ordered_cols))
} %>%
arrange(Dataset, Missing_Percentage, Algorithm, Amputation_Mechanism) %>%
mutate(across(where(is.numeric), \(x) round(x, digits = 2)))
reactable::reactable(table_show, pagination = TRUE, sortable = TRUE, defaultPageSize = 10)
All results. Table with all results, showing the stability score and rank for each metric.
4.2.3 Robustness
Code
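# Assemble the robustness results; as above, complete-data algorithms are
# prefixed with "imp_" or "com_" according to how missing data were handled.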
idx <- !(names(algorithm_colors) %in% incomplete_algorithms)
alg_colors2 <- algorithm_colors[idx]
names(alg_colors2) <- paste0("imp_", names(alg_colors2))
algorithm_colors <- c(algorithm_colors, alg_colors2)
basic_com_results <- com_rbresults %>%
mutate(Methodology = ifelse(Imputation, "Mean imputation", "Only complete"))
results_rb_incomplete <- incom_rbresults %>%
mutate("Real_Alg" = Algorithm, "Methodology" = "Whole dataset") %>%
rbind(
basic_com_results %>%
select(-Strategy) %>%
mutate("Real_Alg" = Algorithm) %>%
mutate(Algorithm = ifelse(Methodology == "Mean imputation", paste0("imp_", Algorithm), ifelse(Methodology == "Only complete", paste0("com_", Algorithm), Algorithm)))
) %>%
mutate("dataset_missing_pattern" = paste0(Dataset, Missing_Percentage, Amputation_Mechanism)) %>%
arrange(Dataset, Missing_Percentage, Algorithm, Amputation_Mechanism, Imputation)
4.2.3.1 Aggregated
Code
metric <- metrics_rob[1]
results_summary_input <- results_rb_incomplete %>%
filter(!((Real_Alg %in% incomplete_algorithms) & Imputation)) %>%
filter(Methodology != "Only complete")
results_summary <- results_summary_input %>%
select(dataset_missing_pattern, Dataset, Algorithm, all_of(metric), Real_Alg) %>%
group_by(dataset_missing_pattern) %>%
mutate(rank = rank(-!!as.name(metric))) %>%
group_by(Algorithm, Real_Alg) %>%
summarise(
"# in Top 3 Models" = sum(rank <= 3),
"# Best" = sum(rank == 1),
"Average Rank" = mean(rank),
"Sd Rank" = sd(rank),
"# Datasets Best" = n_distinct(Dataset[rank == 1]),
"# Datasets Top 3" = n_distinct(Dataset[rank <= 3]),
) %>%
arrange(-`# in Top 3 Models`, -`# Best`, `Average Rank`) %>%
mutate_if(is.numeric, round, digits= 1) %>%
left_join(results_summary_input %>% filter((!(Real_Alg %in% incomplete_algorithms) & (Imputation == TRUE)) | ((Algorithm %in% incomplete_algorithms) & (Imputation == FALSE))) %>% count(Algorithm, name = "# Conditions"), by = "Algorithm") %>%
select(-c("Real_Alg"))
minimum_conditions <- 500
minimum_conditions <- filter(results_summary, `# Conditions` >= minimum_conditions)$Algorithm
reactable::reactable(results_summary, pagination = TRUE, sortable = TRUE, defaultPageSize = 10)
Code
# print(xtable(results_summary[1:10,] %>% mutate("Average Rank" = paste0(round(`Average Rank`, 1), "\u00B1", round(`Sd Rank`, 1)), "# Best" = paste0(`# Best`, "(", `# Datasets Best`, ")"), "# in Top 3 Models" = paste0(`# in Top 3 Models`, "(", `# Datasets Top 3`, ")"), "Algorithm" = gsub("imp_", "(imp)", Algorithm)) %>% select(-c(`Sd Rank`, "# Datasets Best", "# Datasets Top 3"))), include.rownames=FALSE)
Robustness summary. Summary of performance, showing the number of times each method was among the top three methods (with the number of distinct datasets indicated in parentheses), the number of times it achieved the best score (also with dataset count), the average rank (mean\(\pm\)standard deviation), and the total number of conditions in which it was evaluated.
Code
interesting_algs <- c("NEMO", "IMSR", "PIMVC")
for (metric in metrics_rob) {
baseline_values <- results_rb_incomplete %>%
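# Robustness baseline: the best score per dataset at the lowest simulated
# missingness level (10%), rather than the fully observed data used above.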
filter(Missing_Percentage == 10) %>%
group_by(Dataset) %>%
summarise("{metric}" := max(!!as.name(metric))) %>%
select(Dataset, Baseline = metric)
for (ampt_mech in amputation_mechanisms) {
figure <- results_rb_incomplete %>%
filter(Algorithm %in% minimum_conditions) %>%
filter(!((Real_Alg %in% incomplete_algorithms) & Imputation), Methodology != "Only complete") %>%
filter(Methodology != "Only complete") %>%
filter(!((Missing_Percentage == 0) & (Methodology == "Whole dataset") & (Real_Alg %in% complete_algorithms))) %>%
filter(!((Missing_Percentage == 0) & (Methodology == "Mean imputation") & (Real_Alg %in% incomplete_algorithms))) %>%
filter(Amputation_Mechanism == "No" | ampt_mech == "All" | Amputation_Mechanism == ampt_mech) %>%
select(Dataset, Algorithm, Missing_Percentage, Amputation_Mechanism, Real_Alg, metric) %>%
left_join(baseline_values, by = "Dataset") %>%
mutate("{metric}" := pmax(0, pmin(!!as.name(metric) / Baseline, 1))) %>%
group_by(Algorithm, Missing_Percentage) %>%
summarise("{metric}" := mean(!!as.name(metric))) %>%
ggplot(aes(x=Missing_Percentage, y=get(metric), group=Algorithm, colour=Algorithm)) +
geom_line(linewidth = 1.2) +
ylim(0, 1) +
scale_x_continuous(breaks=c(10, 25, 50, 75, 90)) +
xlab("Incomplete samples (%)") + ylab(paste("Normalized", ifelse(metric == "AMI_mean", "adjusted mutual information", "adjusted rand index"))) +
theme_cowplot(font_family = "Times") +
# scale_color_manual(values = algorithm_colors) +
gghighlight(Algorithm %in% interesting_algs, use_direct_label = TRUE, keep_scales = TRUE, unhighlighted_params = list(linewidth = .6))
ggsave(file.path(paper_figures_folder, paste0("inc_lines_rb_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".png")), figure, bg = 'white')
if ((metric == metrics_rob[1]) & (ampt_mech == "All")) {
figure$layers[[4]]$aes_params$size <- 7
figure <- figure + ylab("Adjusted mutual information")
figure <- figure + theme(text = element_text(size=22), axis.text.x = element_text(size=18), axis.text.y = element_text(size=18))
ggsave(file.path(paper_figures_folder, paste0("inc_lines_rb_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".svg")), figure, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0("inc_lines_rb_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".pdf")), figure)
}
}
}
Aggregated performance. Average performance of each method across datasets for varying levels of incomplete samples and different missingness patterns. To ensure consistent aggregation across datasets, all scores were normalized to a common scale prior to averaging.
Code
for (metric in metrics_rob) {
baseline_values <- results_rb_incomplete %>%
filter(Missing_Percentage == 10) %>%
group_by(Dataset) %>%
summarise("{metric}" := max(!!as.name(metric))) %>%
select(Dataset, Baseline = metric)
for (ampt_mech in amputation_mechanisms) {
figure <- results_rb_incomplete %>%
mutate(Methodology = factor(Methodology, levels=c("Whole dataset", "Mean imputation", "Only complete"))) %>%
filter(Real_Alg %in% interesting_algs) %>%
filter(Amputation_Mechanism == "No" | ampt_mech == "All" | Amputation_Mechanism == ampt_mech) %>%
select(Dataset, Algorithm, Missing_Percentage, Amputation_Mechanism, Methodology, Real_Alg, metric) %>%
left_join(baseline_values, by = "Dataset") %>%
mutate("{metric}" := pmax(0, pmin(!!as.name(metric) / Baseline, 1))) %>%
group_by(Algorithm, Missing_Percentage, Methodology, Real_Alg) %>%
summarise("{metric}" := mean(!!as.name(metric))) %>%
arrange(Missing_Percentage, Algorithm, Methodology) %>%
ggplot(aes(x=Missing_Percentage, y=get(metric), group = interaction(Real_Alg, Methodology), colour=Real_Alg, linetype = Methodology)) +
geom_line(linewidth = 1.2) +
ylim(0, 1) +
scale_linetype_manual(values=c("solid", "dotted", "twodash")) +
scale_x_continuous(breaks=c(10, 25, 50, 75, 90)) +
xlab("Incomplete samples (%)") + ylab(paste("Normalized", ifelse(metric == "AMI_mean", "adjusted mutual information", "adjusted rand index"))) +
theme_cowplot() +
# scale_color_manual(values = algorithm_colors) +
theme(legend.position = c(0.05, 0.13), legend.box = "horizontal", text = element_text(family = "Times")) +
guides(colour= guide_legend(title="Algorithm"))
ggsave(file.path(paper_figures_folder, paste0("inc_basic_rb_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".png")), figure, bg = 'white')
if ((metric == metrics_rob[1]) & (ampt_mech == "All")) {
ggsave(file.path(paper_figures_folder, paste0("inc_basic_rb_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".svg")), figure, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0("inc_basic_rb_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".pdf")), figure)
}
}
}
Comparison with common simple approaches. Average performance across datasets of the top methods able to handle incomplete multi-modal data, compared with mean imputation and with excluding incomplete samples, for varying levels of incomplete samples and different missingness patterns. To ensure consistent aggregation across datasets, all scores were normalized to a common scale prior to averaging.
4.2.3.2 Individual
Code
for (metric in metrics_rob) {
baseline_values <- results_rb_incomplete %>%
filter(Missing_Percentage == 10) %>%
group_by(Dataset) %>%
summarise("{metric}" := max(!!as.name(metric))) %>%
select(Dataset, Baseline = metric)
for (ampt_mech in amputation_mechanisms) {
figure <- results_rb_incomplete %>%
filter(Algorithm %in% minimum_conditions) %>%
filter(!((Real_Alg %in% incomplete_algorithms) & Imputation), Methodology != "Only complete") %>%
filter(Methodology != "Only complete") %>%
filter(!((Missing_Percentage == 0) & (Methodology == "Whole dataset") & (Real_Alg %in% complete_algorithms))) %>%
filter(!((Missing_Percentage == 0) & (Methodology == "Mean imputation") & (Real_Alg %in% incomplete_algorithms))) %>%
filter(Amputation_Mechanism == "No" | ampt_mech == "All" | Amputation_Mechanism == ampt_mech) %>%
select(Dataset, Algorithm, Missing_Percentage, Amputation_Mechanism, Real_Alg, metric) %>%
group_by(Dataset, Algorithm, Missing_Percentage) %>%
summarise("{metric}" := mean(!!as.name(metric))) %>%
ggplot(aes(x=Missing_Percentage, y=get(metric), group=Algorithm, colour=Algorithm)) +
facet_wrap(~ Dataset) +
geom_line(linewidth = 1.2) +
ylim(0, 1) +
scale_x_continuous(breaks=c(10, 25, 50, 75, 90)) +
xlab("Incomplete samples (%)") + ylab(ifelse(metric == "AMI_mean", "Adjusted mutual information", "Adjusted rand index")) +
theme_cowplot(font_family = "Times") +
# scale_color_manual(values = algorithm_colors) +
gghighlight(Algorithm %in% interesting_algs, use_direct_label = TRUE, keep_scales = TRUE, unhighlighted_params = list(linewidth = .6), calculate_per_facet = TRUE)
ggsave(file.path(paper_figures_folder, paste0("inc_lines_indrb_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".png")), figure, width=12, height = 10, bg = 'white')
if ((metric == metrics_rob[1]) & (ampt_mech == "All")) {
ggsave(file.path(paper_figures_folder, paste0("inc_lines_indrb_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".svg")), figure, width=12, height = 10, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0("inc_lines_indrb_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".pdf")), figure, width=12, height = 10)
}
}
}
Individual performance. Performance of each method on each dataset for varying levels of incomplete samples and different missingness patterns.
Code
for (metric in metrics_rob) {
baseline_values <- results_rb_incomplete %>%
filter(Missing_Percentage == 10) %>%
group_by(Dataset) %>%
summarise("{metric}" := max(!!as.name(metric))) %>%
select(Dataset, Baseline = metric)
for (ampt_mech in amputation_mechanisms) {
figure <- results_rb_incomplete %>%
mutate(Methodology = factor(Methodology, levels=c("Whole dataset", "Mean imputation", "Only complete"))) %>%
filter(Real_Alg %in% interesting_algs) %>%
filter(Amputation_Mechanism == "No" | ampt_mech == "All" | Amputation_Mechanism == ampt_mech) %>%
select(Dataset, Algorithm, Missing_Percentage, Amputation_Mechanism, Methodology, Real_Alg, "mean" = metric, "sd"= paste0(gsub(extraw_metrics_performance, "", metric), "_std")) %>%
group_by(Dataset, Algorithm, Missing_Percentage, Methodology, Real_Alg) %>%
summarise("sd" = mean(sd), "mean" := mean(mean)) %>%
arrange(Dataset, Missing_Percentage, Algorithm, Methodology) %>%
ggplot(aes(x=Missing_Percentage, y=mean, group = interaction(Real_Alg, Methodology), colour=Real_Alg, linetype = Methodology)) +
facet_wrap(~ Dataset) +
geom_line(linewidth = 1.2) +
ylim(0, 1) +
scale_linetype_manual(values=c("solid", "dotted", "twodash")) +
scale_x_continuous(breaks=c(10, 25, 50, 75, 90)) +
xlab("Incomplete samples (%)") + ylab(ifelse(metric == "AMI_mean", "Adjusted mutual information", "Adjusted rand index")) +
theme_cowplot(font_family = "Times") +
# scale_color_manual(values = algorithm_colors) +
guides(colour= guide_legend(title="Algorithm"))
ggsave(file.path(paper_figures_folder, paste0("inc_basic_indrb_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".png")), figure, width=12, height = 10, bg = 'white')
if ((metric == metrics_rob[1]) & (ampt_mech == "All")) {
ggsave(file.path(paper_figures_folder, paste0("inc_basic_indrb_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".svg")), figure, width=12, height = 10, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0("inc_basic_indrb_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".pdf")), figure, width=12, height = 10)
}
if ((metric == metrics_rob[1]) & (ampt_mech == "PM")) {
figure$data <- figure$data[(figure$data$Real_Alg %in% c("NEMO", "PIMVC")) & (figure$data$Methodology %in% c("Whole dataset", "Only complete")) & (figure$data$Missing_Percentage <= 80),]
figure2 <- figure +
scale_linetype_manual(values=c("solid", "twodash")) +
theme(strip.text.x = element_blank())
figure2$data <- figure2$data[figure$data$Dataset == "BUAA",]
figure2 <- figure2 +
theme(legend.position = c(0.7, 0.3), legend.text = element_text(size=12), legend.title = element_text(size=12))
ggsave(file.path(paper_figures_folder, paste0("inc_basic_indrb_exp_", figure2$data$Dataset[1], "_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".svg")), figure2, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0("inc_basic_indrb_exp_", figure2$data$Dataset[1], "_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".pdf")), figure2)
figure2 <- figure +
scale_linetype_manual(values=c("solid", "twodash")) +
theme(strip.text.x = element_blank())
figure2$data <- figure2$data[figure$data$Dataset == "TCGA",]
figure2 <- figure2 +
theme(legend.position = "none")
ggsave(file.path(paper_figures_folder, paste0("inc_basic_indrb_exp_", figure2$data$Dataset[1], "_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".svg")), figure2, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0("inc_basic_indrb_exp_", figure2$data$Dataset[1], "_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".pdf")), figure2)
}
}
}
Comparison with common simple approaches. Performance of the top methods able to handle incomplete multi-modal data, compared with mean imputation and with excluding incomplete samples, for varying levels of incomplete samples and different missingness patterns.
Code
interesting_algs <- "IMSR"
interesting_datasets <- "BBCSport"
for (metric in metrics_rob) {
for (ampt_mech in amputation_mechanisms) {
figure <- results_rb_incomplete %>%
mutate(Methodology = factor(Methodology, levels=c("Whole dataset", "Mean imputation", "Only complete"))) %>%
filter(Real_Alg %in% interesting_algs) %>%
filter(Amputation_Mechanism == "No" | ampt_mech == "All" | Amputation_Mechanism == ampt_mech) %>%
select(Dataset, Algorithm, Missing_Percentage, Amputation_Mechanism, Methodology, Real_Alg, "mean" = metric, "sd"= paste0(gsub(extraw_metrics_performance, "", metric), "_std")) %>%
filter((Dataset == interesting_datasets) & (Real_Alg == interesting_algs)) %>%
group_by(Dataset, Algorithm, Missing_Percentage, Methodology, Real_Alg) %>%
summarise("sd" = mean(sd), "mean" := mean(mean)) %>%
ggplot(aes(x=Missing_Percentage, y=mean, group = Methodology, colour=Methodology)) +
geom_line() +
ylim(0, 1) +
geom_ribbon(aes(ymax = mean + sd, ymin= mean - sd, fill= Methodology, x=Missing_Percentage), alpha = 0.3) +
scale_x_continuous(breaks=c(10, 25, 50, 75, 90)) +
xlab("Incomplete samples (%)") + ylab(ifelse(metric == "AMI_mean", "Adjusted mutual information", "Adjusted rand index")) +
theme_cowplot(font_family = "Times") +
theme(legend.position = c(0.1, 0.2), legend.text = element_text(size=18), legend.title = element_blank())
ggsave(file.path(paper_figures_folder, paste0("conf_rb_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".png")), figure, bg = 'white')
if ((metric == metrics_rob[1]) & (ampt_mech == "PM")) {
figure <- figure + theme(text = element_text(size=22), axis.text.x = element_text(size=18), axis.text.y = element_text(size=18))
ggsave(file.path(paper_figures_folder, paste0("conf_rb_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".svg")), figure, bg = 'white')
ggsave(file.path(paper_figures_folder, paste0("conf_rb_", ampt_mech, "_", gsub(extraw_metrics_performance, "", metric),".pdf")), figure)
}
}
}
Confidence interval. Confidence interval (mean\(\pm\)standard deviation) for IMSR on the BBCSport dataset, compared with mean imputation and with excluding incomplete samples, for varying levels of incomplete samples and different missingness patterns.
Code
table_show <- results_rb_incomplete %>%
group_by(dataset_missing_pattern) %>%
mutate(across(all_of(metrics_rob), ~ rank(-.), .names ="{.col}_rank")) %>%
ungroup() %>%
rename_with(~ str_remove(., extraw_metrics_performance), ends_with("_rank")) %>%
{
metric_cols <- grep(paste0("^(", paste(gsub(extraw_metrics_performance, "", metrics_rob), collapse = "|"), ")_(mean|std|rank)$"),
colnames(.), value = TRUE)
mean_cols <- grep("_mean$", metric_cols, value = TRUE)
std_cols <- grep("_std$", metric_cols, value = TRUE)
rank_cols <- grep("_rank$", metric_cols, value = TRUE)
ordered_cols <- c(rbind(mean_cols, std_cols, rank_cols))
ordered_cols <- ordered_cols[!is.na(ordered_cols)]
select(., Dataset, Algorithm, Missing_Percentage, Amputation_Mechanism, all_of(ordered_cols))
} %>%
arrange(Dataset, Missing_Percentage, Algorithm, Amputation_Mechanism) %>%
mutate(across(where(is.numeric), \(x) round(x, digits = 2)))
reactable::reactable(table_show, pagination = TRUE, sortable = TRUE, defaultPageSize = 10)
All results. Table with all results, showing mean, standard deviation and rank for each metric.
5 Experiments
We first used fully observed data, performing 25 independent runs per dataset, each on a random 80% subsample of the samples, with different random seeds (for both the sample selection and the algorithm initialization). For the incomplete-data scenario, we simulated modality-wise missing data using four different missingness patterns: missing completely at random (MCAR, modalities missing at random), missing not at random (MNAR, specific samples shared a similar missing-modality pattern), partial missing (PM, some modalities were fully observed while others were partially missing), and mutually exclusive missing (MEM, incomplete samples retained only one modality). Each algorithm was tested on each dataset across the different missingness patterns and missingness levels ranging from 10% to 90% in 10% intervals, with 25 independent runs per condition using different random seeds. Methods not capable of handling missing data directly were applied after mean imputation.
Experiments. Experimental workflow designed to address the research questions explored in this project, in which we conducted a systematic and empirical benchmarking study on both fully observed and modality-wise missing multi-modal datasets. Over 1M clustering models were generated and evaluated from multiple perspectives (ground-truth label matching, cluster stability, robustness to missing data, cluster structures and computational efficiency) using diverse robust metrics.
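As an illustration of the modality-wise amputation, the sketch below simulates the MCAR pattern under simplifying assumptions (this is not the benchmark's actual amputation code; `views` is assumed to be a list of sample-by-feature matrices with aligned rows, and `simulate_mcar` is a hypothetical helper): for a randomly chosen fraction of samples, one randomly chosen modality is blanked out entirely.
Code
# Hypothetical sketch: modality-wise MCAR amputation. For missing_pct % of the
# samples, one randomly selected modality is set entirely to NA; the other
# patterns (MNAR, PM, MEM) would instead constrain which modalities can be
# blanked for which samples.
simulate_mcar <- function(views, missing_pct, seed = 1) {
  set.seed(seed)
  n <- nrow(views[[1]])
  incomplete <- sample(n, size = round(n * missing_pct / 100))
  for (i in incomplete) {
    v <- sample(length(views), 1)  # modality to blank for this sample
    views[[v]][i, ] <- NA
  }
  views
}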
5.1 Datasets
Code
reactable::reactable(dataset_table, pagination = TRUE, sortable = TRUE, defaultPageSize = 10)
Code
# print(xtable(dataset_table), include.rownames=FALSE)
6 Abbreviations
Matthews correlation coefficient: MCC
Adjusted mutual information: AMI
Adjusted Rand index: ARI
Variance ratio criterion: VRC
Density-based clustering validation index: DBCVI
Duda-Hart index: DHI
Sum of squared error index: SSEI
R-squared index: RSI
Davies-Bouldin index: DB
Ball-Hall index: BHI
Missing completely at random: MCAR
Missing not at random: MNAR
Partial missing: PM
Mutually exclusive missing: MEM
Angle-based Joint and Individual Variation Explained: AJIVE
Co-Regularized Multiview Spectral Clustering: MVCRSC
Cluster-Of-Clusters Analysis: COCA
Data Fusion by Matrix Factorization: DFMF
Doubly Aligned Incomplete Multi-view Clustering: DAIMC
Efficient and Effective Incomplete Multi-view Clustering: EEIMVC
Group Principal Component Analysis: GPCA
Incomplete Multiview Spectral Clustering With Adaptive Graph Learning: IMSCAGL
Integrative Non-negative Matrix Factorization: iNMF
Self-representation Subspace Clustering for Incomplete Multi-view Data: IMSR
Joint Non-negative Matrix Factorization Algorithms: jNMF
Late Fusion Incomplete Multi-View Clustering: LFIMVC
Multiple Kernel k-Means with Incomplete Kernels: MKKMIK
Multi-Omics Factor Analysis: MOFA
Multi Omic clustering by Non-Exhaustive Types: MONET
Multi-Reconstruction Graph Convolutional Network: MRGCN
Multiple Similarity Network Embedding: MSNE
Multiview Spectral Clustering: MVSC
NEighborhood based Multi-Omics clustering: NEMO
Non-Negative Matrix Factorization: NMF
Online multi-view clustering: OMVC
One-Pass Incomplete Multi-View Clustering: OPIMC
One-Stage Incomplete Multi-view Clustering via Late Fusion: OSLFIMVC
Projective Incomplete Multi-View Clustering: PIMVC
Scalable Incomplete Multiview Clustering with Adaptive Data Completion: SIMCADC
Similarity Network Fusion: SNF
Subtyping Tool for Multi-Omic data: SUMO
7 Environment
Code
sessionInfo()
R version 4.1.2 (2021-11-01)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.3 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=nb_NO.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=nb_NO.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=nb_NO.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=nb_NO.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] stringr_1.5.1 rlang_1.1.3 reshape2_1.4.4
[4] ggrepel_0.9.6 tibble_3.2.1 Rtsne_0.17
[7] caTools_1.18.3 xtable_1.8-4 ggraph_2.2.1
[10] igraph_2.1.4 PMCMRplus_1.9.12 svglite_2.1.3
[13] reactable_0.4.4 htmlwidgets_1.6.4 plotly_4.10.4
[16] corrplot_0.95 ggiraphExtra_0.3.0 cowplot_1.1.3
[19] readr_2.1.5 tidyr_1.3.1 ggdist_3.3.2
[22] pals_1.9 mlr3benchmark_0.1.7 gghighlight_0.4.1
[25] dplyr_1.1.4 ggplot2_3.5.1
loaded via a namespace (and not attached):
[1] nlme_3.1-155 bitops_1.0-9 bit64_4.6.0-1
[4] insight_1.0.1 RColorBrewer_1.1-3 httr_1.4.7
[7] tools_4.1.2 backports_1.5.0 utf8_1.2.4
[10] R6_2.5.1 sjlabelled_1.2.0 lazyeval_0.2.2
[13] mgcv_1.8-39 colorspace_2.1-0 withr_3.0.0
[16] mlr3misc_0.16.0 gridExtra_2.3 tidyselect_1.2.1
[19] extrafontdb_1.0 bit_4.5.0.1 compiler_4.1.2
[22] cli_3.6.2 labeling_0.4.3 BWStest_0.2.3
[25] scales_1.3.0 checkmate_2.3.2 mvtnorm_1.3-3
[28] multcompView_0.1-10 systemfonts_1.2.1 digest_0.6.35
[31] rmarkdown_2.29 dichromat_2.0-0.1 pkgconfig_2.0.3
[34] htmltools_0.5.8.1 extrafont_0.19 fastmap_1.2.0
[37] maps_3.4.2 SuppDists_1.1-9.8 generics_0.1.3
[40] farver_2.1.1 jsonlite_1.8.8 crosstalk_1.2.1
[43] vroom_1.6.5 distributional_0.5.0 magrittr_2.0.3
[46] Matrix_1.4-0 Rcpp_1.0.12 munsell_0.5.0
[49] fansi_1.0.6 viridis_0.6.5 lifecycle_1.0.4
[52] stringi_1.8.3 yaml_2.3.10 MASS_7.3-55
[55] plyr_1.8.9 parallel_4.1.2 crayon_1.5.3
[58] sjmisc_2.8.10 ppcor_1.1 lattice_0.20-45
[61] graphlayouts_1.2.2 splines_4.1.2 mapproj_1.2.11
[64] hms_1.1.3 knitr_1.49 pillar_1.9.0
[67] uuid_1.2-1 kSamples_1.2-10 glue_1.7.0
[70] ggiraph_0.8.12 evaluate_1.0.3 mycor_0.1.1
[73] data.table_1.15.4 vctrs_0.6.5 tzdb_0.4.0
[76] tweenr_2.0.3 Rttf2pt1_1.3.12 gtable_0.3.4
[79] purrr_1.0.2 polyclip_1.10-7 reactR_0.6.1
[82] cachem_1.1.0 xfun_0.50 ggforce_0.4.2
[85] tidygraph_1.3.1 Rmpfr_1.0-0 viridisLite_0.4.2
[88] memoise_2.0.1 gmp_0.7-5