Skip to contents

This vignette walks through the complete MOSAIC pipeline using simulated data only — no external datasets required. After installing MOSAIC, you can run every code block below to see the framework in action.

We will:

  1. Generate synthetic multi-modal data with known differential connectivity (DC) signal
  2. Run the MOSAIC embedding
  3. Perform differential connectivity analysis
  4. Detect patient subgroups from feature modules

Step 1: Generate Simulated Data

We simulate a two-modality dataset with 20 samples (10 per condition), where 20% of features undergo connectivity rewiring between conditions A and B.

library(MOSAIC)
library(ggplot2)

sim <- simulate_multimodal_data(
  simulation_type = "DC",
  n_features = c(500, 400),
  signal_prop = 0.2,
  seed = 42,
  rescale = TRUE,
  add_batch = FALSE,
  nonlinear = FALSE,
  sample_noise_level = 0.5,
  feature_noise_level = 0.5,
  cell_noise_level = 0.5
)

cat("Modalities:", length(sim$seurat_list), "\n")
#> Modalities: 2
cat("Samples:", nrow(sim$sample_metadata), "\n")
#> Samples: 20
cat("Features per modality:", sapply(sim$seurat_list, nrow), "\n")
#> Features per modality: 500 400

The returned Seurat objects include ground-truth labels: feature_cluster (which module a feature belongs to) and is_de (whether the feature was rewired).

# Ground truth for modality 1
feat_meta <- sim$seurat_list[[1]][["originalexp"]][[]]
cat("DC features in modality 1:", sum(feat_meta$is_de), "/", nrow(feat_meta), "\n")
#> DC features in modality 1: 100 / 500

Step 2: Run MOSAIC Embedding

result <- run_MOSAIC(
  sim$seurat_list,
  assays = c("originalexp", "originalexp"),
  sample_meta = "sample_id",
  condition_meta = "condition"
)
#> MOSAIC: 2 modality/modalities, 20 samples detected
#> Processing sample 1 / 20
#> Processing sample 2 / 20
#> Processing sample 3 / 20
#> Processing sample 4 / 20
#> Processing sample 5 / 20
#> Processing sample 6 / 20
#> Processing sample 7 / 20
#> Processing sample 8 / 20
#> Processing sample 9 / 20
#> Processing sample 10 / 20
#> Processing sample 11 / 20
#> Processing sample 12 / 20
#> Processing sample 13 / 20
#> Processing sample 14 / 20
#> Processing sample 15 / 20
#> Processing sample 16 / 20
#> Processing sample 17 / 20
#> Processing sample 18 / 20
#> Processing sample 19 / 20
#> Processing sample 20 / 20
#> Per-sample rank (median): 12
#> Global rank: 13
#> Done.

Visualize the Eigenvalue Spectrum

The eigenvalues from the aggregated spectral decomposition reveal the effective dimensionality. The red line marks the automatically selected elbow.

elbow <- find_elbow_kneedle(result$eigenvalues)

plot_eigen(result$eigenvalues) +
  geom_vline(xintercept = elbow, color = "red", linetype = "dashed") +
  ggtitle(paste0("Eigenvalue spectrum (elbow = ", elbow, ")"))

Step 3: Differential Connectivity Analysis

3a. Run DC test on real labels

n_sample <- length(result$mosaic_embed_list)

dc_result <- run_DC_test(
  result$mosaic_embed_list,
  n_sample = n_sample,
  groups = result$annotation$Condition
)

3b. Run DC test on shuffled labels (null distribution)

We shuffle cell-to-sample assignments to destroy condition-specific signals, re-run MOSAIC, and collect null F-statistics.

# Shuffle sample labels
set.seed(123)
shuffled_idx <- sample(ncol(sim$seurat_list[[1]]))

for (k in seq_along(sim$seurat_list)) {
  sim$seurat_list[[k]]$shuffled_sample <- sim$seurat_list[[k]]$sample_id[shuffled_idx]
  sim$seurat_list[[k]]$shuffled_condition <- sim$seurat_list[[k]]$condition[shuffled_idx]
}

shuffle_mosaic <- run_MOSAIC(
  sim$seurat_list,
  assays = c("originalexp", "originalexp"),
  sample_meta = "shuffled_sample",
  condition_meta = "shuffled_condition",
  verbose = FALSE
)

shuffle_dc <- run_DC_test(
  shuffle_mosaic$mosaic_embed_list,
  n_sample = n_sample,
  groups = shuffle_mosaic$annotation$Condition
)

3c. Compute empirical p-values

F_obs <- unlist(dc_result$F_stats_list)
F_null <- unlist(shuffle_dc$F_stats_list)

pvalues <- sapply(F_obs, function(x) calculate_empirical_pvalue(x, F_null))

dc_idx <- which(pvalues < 0.05)
cat(length(dc_idx), "DC features detected out of", length(pvalues), "total\n")
#> 44 DC features detected out of 900 total

3d. Compare with ground truth

# Ground truth: combine is_de from both modalities
ground_truth <- c(
  sim$seurat_list[[1]][["originalexp"]][[]]$is_de,
  sim$seurat_list[[2]][["originalexp"]][[]]$is_de
)

true_dc <- which(ground_truth)
detected_dc <- dc_idx

precision <- length(intersect(detected_dc, true_dc)) / length(detected_dc)
recall <- length(intersect(detected_dc, true_dc)) / length(true_dc)

cat(sprintf("Precision: %.3f\n", precision))
#> Precision: 1.000
cat(sprintf("Recall: %.3f\n", recall))
#> Recall: 0.244

3e. Visualize a DC feature

# Pick a true DC feature
chosen_idx <- true_dc[1]

sim_mat <- dc_result$similarity_matrix_list[[chosen_idx]]
groups <- dc_result$group_list[[chosen_idx]]

plot_mds_cluster(
  sim_mat,
  title = paste("Feature", chosen_idx, "(true DC)"),
  cluster = groups,
  custom_colors = c("A" = "#56B4E9", "B" = "#E69F00"),
  size = 2,
  ellipse_level = 0.68
)

Step 4: Unsupervised Subgroup Detection

Even without using condition labels, MOSAIC can discover patient subgroups by identifying feature modules whose connectivity patterns partition samples into distinct groups.

4a. Compute feature stratification profiles

# Use the similarity matrices from DC test (already computed)
sim_vectors <- lapply(dc_result$similarity_matrix_list, function(mat) {
  mat[upper.tri(mat, diag = FALSE)]
})

strat_matrix <- do.call(cbind, sim_vectors)
feature_corr <- cor(strat_matrix, method = "spearman")

4b. Identify feature modules

library(dynamicTreeCut)

dist_feat <- as.dist(1 - feature_corr)
hc_feat <- hclust(dist_feat, method = "ward.D2")

feature_clusters <- cutreeDynamic(
  dendro = hc_feat,
  distM = as.matrix(dist_feat),
  minClusterSize = 15,
  method = "hybrid",
  deepSplit = 2
)
#>  ..cutHeight not given, setting it to 8.71  ===>  99% of the (truncated) height range in dendro.
#>  ..done.
names(feature_clusters) <- colnames(feature_corr)

module_ids <- sort(unique(feature_clusters[feature_clusters > 0]))
cat(length(module_ids), "modules identified\n")
#> 23 modules identified
cat("Module sizes:", sapply(module_ids, function(m) sum(feature_clusters == m)), "\n")
#> Module sizes: 106 98 68 42 41 40 39 37 37 37 36 35 33 32 31 30 27 27 26 23 20 18 17

4c. Screen modules for subgroups

total_features <- nrow(result$mosaic_embed_list[[1]])

screen_results <- data.frame(
  module = integer(), n_features = integer(),
  silhouette = numeric(), balanced = logical(),
  stringsAsFactors = FALSE
)

for (m in module_ids) {
  idx <- which(feature_clusters == m)
  sim_mat <- compute_module_similarity(result$mosaic_embed_list, idx)
  partition <- find_partition_hclust(as.dist(1 - sim_mat))

  screen_results <- rbind(screen_results, data.frame(
    module = m,
    n_features = length(idx),
    silhouette = ifelse(partition$balanced, partition$silhouette, NA),
    balanced = partition$balanced
  ))
}

screen_results <- screen_results[order(-screen_results$silhouette), ]
print(screen_results)
#>    module n_features silhouette balanced
#> 1       1        106  0.9462005     TRUE
#> 2       2         98  0.9144864     TRUE
#> 5       5         41  0.5063513     TRUE
#> 3       3         68         NA    FALSE
#> 4       4         42         NA    FALSE
#> 6       6         40         NA    FALSE
#> 7       7         39         NA    FALSE
#> 8       8         37         NA    FALSE
#> 9       9         37         NA    FALSE
#> 10     10         37         NA    FALSE
#> 11     11         36         NA    FALSE
#> 12     12         35         NA    FALSE
#> 13     13         33         NA    FALSE
#> 14     14         32         NA    FALSE
#> 15     15         31         NA    FALSE
#> 16     16         30         NA    FALSE
#> 17     17         27         NA    FALSE
#> 18     18         27         NA    FALSE
#> 19     19         26         NA    FALSE
#> 20     20         23         NA    FALSE
#> 21     21         20         NA    FALSE
#> 22     22         18         NA    FALSE
#> 23     23         17         NA    FALSE

4d. Visualize the best module’s heatmap

library(pheatmap)

best_module <- screen_results$module[which.max(screen_results$silhouette)]
best_idx <- which(feature_clusters == best_module)

sim_mat <- compute_module_similarity(result$mosaic_embed_list, best_idx)
partition <- find_partition_hclust(as.dist(1 - sim_mat))

annotation <- result$annotation
annotation$Subgroup <- ifelse(partition$groups == 1, "Group1", "Group2")

annotation_colors <- list(
  Condition = c("A" = "#56B4E9", "B" = "#E69F00"),
  Subgroup = c("Group1" = "#4DBBD5", "Group2" = "#E64B35")
)

pheatmap(
  sim_mat,
  annotation_row = annotation,
  annotation_col = annotation,
  annotation_colors = annotation_colors,
  cluster_rows = partition$hclust,
  cluster_cols = partition$hclust,
  show_rownames = TRUE,
  show_colnames = FALSE,
  main = sprintf("Module %d | %d features | Silhouette = %.3f",
                 best_module, length(best_idx), partition$silhouette)
)


cat("\nSubgroup x Condition:\n")
#> 
#> Subgroup x Condition:
print(table(annotation$Subgroup, annotation$Condition))
#>         
#>           A  B
#>   Group1 10  0
#>   Group2  0 10

If the simulation is working correctly, the discovered subgroups should align closely with the true conditions A and B, since the DC signal creates condition-specific connectivity patterns that MOSAIC’s modular approach can recover in an unsupervised manner.

Session Info

sessionInfo()
#> R version 4.1.0 (2021-05-18)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.2 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] pheatmap_1.0.12       dynamicTreeCut_1.63-1 ggplot2_3.5.1        
#> [4] MOSAIC_1.0.0         
#> 
#> loaded via a namespace (and not attached):
#>   [1] Seurat_5.2.1           Rtsne_0.17             colorspace_2.1-1      
#>   [4] deldir_2.0-4           ggridges_0.5.6         RcppHNSW_0.6.0        
#>   [7] fs_1.6.5               spatstat.data_3.1-4    listenv_0.9.1         
#>  [10] farver_2.1.2           ggrepel_0.9.6          RSpectra_0.16-2       
#>  [13] codetools_0.2-18       splines_4.1.0          doParallel_1.0.17     
#>  [16] cachem_1.1.0           knitr_1.49             polyclip_1.10-7       
#>  [19] spam_2.11-1            jsonlite_1.8.9         ica_1.0-3             
#>  [22] cluster_2.1.2          png_0.1-8              uwot_0.2.2            
#>  [25] spatstat.sparse_3.1-0  shiny_1.10.0           sctransform_0.4.1     
#>  [28] compiler_4.1.0         httr_1.4.7             SeuratObject_5.0.2    
#>  [31] Matrix_1.6-5           fastmap_1.2.0          lazyeval_0.2.2        
#>  [34] cli_3.6.3              later_1.4.1            htmltools_0.5.8.1     
#>  [37] tools_4.1.0            igraph_2.1.4           dotCall64_1.2         
#>  [40] gtable_0.3.6           glue_1.8.0             RANN_2.6.2            
#>  [43] reshape2_1.4.4         dplyr_1.1.4            Rcpp_1.0.14           
#>  [46] scattermore_1.2        spatstat.univar_3.1-1  jquerylib_0.1.4       
#>  [49] pkgdown_2.1.1          vctrs_0.6.5            nlme_3.1-152          
#>  [52] spatstat.explore_3.3-4 progressr_0.15.1       iterators_1.0.14      
#>  [55] lmtest_0.9-40          spatstat.random_3.3-2  xfun_0.50             
#>  [58] stringr_1.5.1          globals_0.16.3         mime_0.12             
#>  [61] miniUI_0.1.1.1         lifecycle_1.0.4        irlba_2.3.5.1         
#>  [64] goftest_1.2-3          future_1.34.0          MASS_7.3-54           
#>  [67] zoo_1.8-12             scales_1.3.0           spatstat.utils_3.1-2  
#>  [70] ragg_1.3.3             promises_1.3.2         parallel_4.1.0        
#>  [73] RColorBrewer_1.1-3     yaml_2.3.10            reticulate_1.40.0     
#>  [76] pbapply_1.7-2          gridExtra_2.3          sass_0.4.9            
#>  [79] stringi_1.8.4          desc_1.4.3             fastDummies_1.7.5     
#>  [82] foreach_1.5.2          permute_0.9-7          rlang_1.1.5           
#>  [85] pkgconfig_2.0.3        systemfonts_1.2.1      matrixStats_1.5.0     
#>  [88] evaluate_1.0.3         lattice_0.20-44        tensor_1.5            
#>  [91] ROCR_1.0-11            purrr_1.0.2            labeling_0.4.3        
#>  [94] patchwork_1.3.0        htmlwidgets_1.6.4      cowplot_1.1.3         
#>  [97] tidyselect_1.2.1       parallelly_1.41.0      RcppAnnoy_0.0.22      
#> [100] plyr_1.8.9             magrittr_2.0.3         R6_2.5.1              
#> [103] generics_0.1.3         withr_3.0.2            mgcv_1.8-36           
#> [106] pillar_1.10.1          fitdistrplus_1.2-2     abind_1.4-8           
#> [109] survival_3.2-11        sp_2.1-4               tibble_3.2.1          
#> [112] future.apply_1.11.3    KernSmooth_2.23-20     spatstat.geom_3.3-5   
#> [115] plotly_4.10.4          rmarkdown_2.29         grid_4.1.0            
#> [118] data.table_1.16.4      vegan_2.6-8            digest_0.6.37         
#> [121] xtable_1.8-4           tidyr_1.3.1            httpuv_1.6.15         
#> [124] textshaping_1.0.0      munsell_0.5.1          viridisLite_0.4.2     
#> [127] bslib_0.8.0