BPCells FindMarkers

A reproducible example to compare finding top markers between Seuratv5 and BPCells

libraries

library(tidyverse)
library(Seurat)
library(BPCells)
library(Azimuth)
library(SeuratObject)
library(presto)
library(SeuratWrappers)

options(Seurat.object.assay.version = "v5")

Download data

url <- "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM6176195&format=file&file=GSM6176195%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%5Fp1%5Fbiopsy1%5Frna%2Eh5"
# Only download files if we haven't downloaded already
if (!file.exists("pcnsl.h5")) {
  download.file(url, "pcnsl.h5", mode="wb")
}

data loading

# Check if we already ran import
if (!file.exists("pcnsl_raw")) {
  mat_raw <- open_matrix_10x_hdf5("pcnsl.h5", feature_type="Gene Expression") %>% 
    write_matrix_dir("pcnsl_raw")
} else {
  mat_raw <- open_matrix_dir("pcnsl_raw")
}

create seurat object

mat_raw <- Azimuth:::ConvertEnsembleToSymbol(mat = mat_raw, species = "human")
seu <- CreateSeuratObject(counts = mat_raw)

Seurat pipeline

seu <- NormalizeData(seu)
seu <- FindVariableFeatures(seu)
seu <- ScaleData(seu)
seu <- RunPCA(seu)
seu <- FindNeighbors(seu)
seu <- FindClusters(seu, resolution = 0.8)
seu <- RunUMAP(seu, reduction = "pca", dims = 1:30)

BPCells markers

no need to convert and very fast

system.time(
    bp_markers <- BPCells::marker_features(seu$RNA$data, Idents(seu))
)
Warning: marker features calculation requires row-major storage
• Consider using transpose_storage_order() if running marker_features repeatedly
Writing transposed storage order to /tmp/RtmpcOqfwj/transpose915596b6a78c7
This message is displayed once every 8 hours.
   user  system elapsed 
  1.101   0.558   1.660 

p values are the same as in Seurat but log2FC are different

bp_markers |>
mutate(log2fc = (foreground_mean - background_mean)/log(2)) |>
mutate(p_val_adj = p.adjust(p_val_raw, method = "bonferroni")) |>
dplyr::filter(p_val_adj < 0.05) |>
dplyr::filter(foreground == 13) |>
dplyr::arrange(desc(log2fc)) |>
dplyr::select(feature, p_val_raw, log2fc, p_val_adj)
# A tibble: 2,123 × 4
   feature p_val_raw log2fc p_val_adj
   <chr>       <dbl>  <dbl>     <dbl>
 1 APOE    4.40e- 42   4.40 1.67e- 36
 2 LYZ     1.00e-115   4.32 3.80e-110
 3 FTL     3.25e- 60   4.18 1.23e- 54
 4 FTH1    5.99e- 66   4.08 2.27e- 60
 5 APOC1   1.36e- 84   3.31 5.14e- 79
 6 SAT1    3.31e- 54   3.11 1.25e- 48
 7 NPC2    7.84e- 56   3.07 2.97e- 50
 8 TYROBP  2.32e- 85   3.05 8.80e- 80
 9 PSAP    8.87e- 56   3.03 3.36e- 50
10 PTGDS   4.97e- 33   3.02 1.88e- 27
# ℹ 2,113 more rows

presto markers

do not work on BPCells Matrix

 presto::wilcoxauc(seu, "seurat_clusters")
Error in UseMethod("rank_matrix"): no applicable method for 'rank_matrix' applied to an object of class "c('RenameDims', 'IterableMatrix')"

Seurat markers

takes much longer, so only calculcate for cluster 13

#this would be the corresponding command but takes very long
# seu_markers <- FindAllMarkers(seu, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
system.time(
    seu_markers_bp <- FindMarkers(seu, ident.1 = "13", ident.2 = NULL, min.pct = 0.25, logfc.threshold = 0.5)
)
   user  system elapsed 
  3.226   0.035   3.263 

without conversion into sparse matrix no p values, but log2FC are the same

seu_markers_bp |>
    arrange(desc(avg_log2FC))|>
    head(10)
       p_val avg_log2FC pct.1 pct.2 p_val_adj
LYZ       NA   7.146448 0.894 0.162        NA
APOE      NA   7.032490 0.817 0.473        NA
PTGDS     NA   6.608698 0.654 0.279        NA
S100A9    NA   6.535848 0.596 0.048        NA
APOC1     NA   6.255360 0.779 0.160        NA
GPNMB     NA   5.975639 0.760 0.055        NA
CST3      NA   5.964254 0.885 0.050        NA
CAPG      NA   5.941394 0.837 0.100        NA
S100A8    NA   5.878775 0.423 0.025        NA
CD68      NA   5.716077 0.865 0.071        NA
seu[["RNA"]]$data <- as(object = seu[["RNA"]]$data, Class = "dgCMatrix")

#this would be the corresponding command but takes very long
# seu_markers <- FindAllMarkers(seu, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
system.time(
    seu_markers <- FindMarkers(seu, ident.1 = "13", ident.2 = NULL, min.pct = 0.25, logfc.threshold = 0.5)
)
   user  system elapsed 
 38.008   0.335  38.349 
seu_markers |>
    arrange(desc(avg_log2FC))|>
    head(10)
               p_val avg_log2FC pct.1 pct.2     p_val_adj
LYZ    1.003133e-115   7.146448 0.894 0.162 2.532911e-111
APOE    4.396535e-42   7.032490 0.817 0.473  1.110125e-37
PTGDS   4.974038e-33   6.608698 0.654 0.279  1.255945e-28
S100A9 1.479987e-126   6.535848 0.596 0.048 3.736967e-122
APOC1   1.356092e-84   6.255360 0.779 0.160  3.424133e-80
GPNMB  1.002223e-184   5.975639 0.760 0.055 2.530614e-180
CST3   5.516853e-260   5.964254 0.885 0.050 1.393005e-255
CAPG   1.569736e-142   5.941394 0.837 0.100 3.963583e-138
S100A8 4.785009e-110   5.878775 0.423 0.025 1.208215e-105
CD68   2.234567e-196   5.716077 0.865 0.071 5.642281e-192

session info

sessionInfo()
R version 4.3.1 (2023-06-16)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.3 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Berlin
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] biomaRt_2.56.1          SeuratWrappers_0.3.19   presto_1.0.0           
 [4] data.table_1.14.8       Rcpp_1.0.11             Azimuth_0.4.6.9004     
 [7] shinyBS_0.61.1          BPCells_0.1.0           Seurat_4.9.9.9060      
[10] SeuratObject_4.9.9.9091 sp_2.0-0                lubridate_1.9.2        
[13] forcats_1.0.0           stringr_1.5.0           dplyr_1.1.3            
[16] purrr_1.0.2             readr_2.1.4             tidyr_1.3.0            
[19] tibble_3.2.1            ggplot2_3.4.3           tidyverse_2.0.0        

loaded via a namespace (and not attached):
  [1] fs_1.6.3                          ProtGenerics_1.32.0              
  [3] matrixStats_1.0.0                 spatstat.sparse_3.0-2            
  [5] bitops_1.0-7                      DirichletMultinomial_1.42.0      
  [7] TFBSTools_1.38.0                  httr_1.4.7                       
  [9] RColorBrewer_1.1-3                tools_4.3.1                      
 [11] sctransform_0.3.5                 utf8_1.2.3                       
 [13] R6_2.5.1                          DT_0.28                          
 [15] lazyeval_0.2.2                    uwot_0.1.16                      
 [17] rhdf5filters_1.12.1               withr_2.5.0                      
 [19] prettyunits_1.1.1                 gridExtra_2.3                    
 [21] progressr_0.14.0                  cli_3.6.1                        
 [23] Biobase_2.60.0                    spatstat.explore_3.2-3           
 [25] fastDummies_1.7.3                 EnsDb.Hsapiens.v86_2.99.0        
 [27] shinyjs_2.1.0                     spatstat.data_3.0-1              
 [29] ggridges_0.5.4                    pbapply_1.7-2                    
 [31] Rsamtools_2.16.0                  R.utils_2.12.2                   
 [33] parallelly_1.36.0                 limma_3.56.2                     
 [35] BSgenome_1.68.0                   rstudioapi_0.15.0                
 [37] RSQLite_2.3.1                     generics_0.1.3                   
 [39] BiocIO_1.10.0                     gtools_3.9.4                     
 [41] ica_1.0-3                         spatstat.random_3.1-6            
 [43] googlesheets4_1.1.1               GO.db_3.17.0                     
 [45] Matrix_1.6-0                      fansi_1.0.4                      
 [47] S4Vectors_0.38.1                  abind_1.4-5                      
 [49] R.methodsS3_1.8.2                 lifecycle_1.0.3                  
 [51] yaml_2.3.7                        SummarizedExperiment_1.30.2      
 [53] rhdf5_2.44.0                      BiocFileCache_2.8.0              
 [55] Rtsne_0.16                        grid_4.3.1                       
 [57] blob_1.2.4                        promises_1.2.1                   
 [59] shinydashboard_0.7.2              crayon_1.5.2                     
 [61] miniUI_0.1.1.1                    lattice_0.21-8                   
 [63] cowplot_1.1.1                     annotate_1.78.0                  
 [65] GenomicFeatures_1.52.1            KEGGREST_1.40.0                  
 [67] pillar_1.9.0                      knitr_1.44                       
 [69] GenomicRanges_1.52.0              rjson_0.2.21                     
 [71] future.apply_1.11.0               codetools_0.2-19                 
 [73] fastmatch_1.1-3                   leiden_0.4.3                     
 [75] glue_1.6.2                        remotes_2.4.2.1                  
 [77] vctrs_0.6.3                       png_0.1-8                        
 [79] spam_2.9-1                        cellranger_1.1.0                 
 [81] gtable_0.3.4                      poweRlaw_0.70.6                  
 [83] cachem_1.0.8                      xfun_0.40                        
 [85] Signac_1.10.0                     S4Arrays_1.0.5                   
 [87] mime_0.12                         pracma_2.4.2                     
 [89] survival_3.5-5                    gargle_1.5.2                     
 [91] RcppRoll_0.3.0                    ellipsis_0.3.2                   
 [93] fitdistrplus_1.1-11               ROCR_1.0-11                      
 [95] nlme_3.1-162                      bit64_4.0.5                      
 [97] progress_1.2.2                    filelock_1.0.2                   
 [99] RcppAnnoy_0.0.21                  GenomeInfoDb_1.36.1              
[101] irlba_2.3.5.1                     KernSmooth_2.23-22               
[103] SeuratDisk_0.0.0.9020             colorspace_2.1-0                 
[105] seqLogo_1.66.0                    BiocGenerics_0.46.0              
[107] DBI_1.1.3                         tidyselect_1.2.0                 
[109] bit_4.0.5                         compiler_4.3.1                   
[111] curl_5.0.2                        hdf5r_1.3.8                      
[113] xml2_1.3.5                        DelayedArray_0.26.7              
[115] plotly_4.10.2                     rtracklayer_1.60.0               
[117] scales_1.2.1                      caTools_1.18.2                   
[119] lmtest_0.9-40                     rappdirs_0.3.3                   
[121] digest_0.6.33                     goftest_1.2-3                    
[123] spatstat.utils_3.0-3              rmarkdown_2.24                   
[125] XVector_0.40.0                    htmltools_0.5.6                  
[127] pkgconfig_2.0.3                   MatrixGenerics_1.12.3            
[129] dbplyr_2.3.3                      fastmap_1.1.1                    
[131] ensembldb_2.24.0                  rlang_1.1.1                      
[133] htmlwidgets_1.6.2                 shiny_1.7.5                      
[135] zoo_1.8-12                        jsonlite_1.8.7                   
[137] BiocParallel_1.34.2               R.oo_1.25.0                      
[139] RCurl_1.98-1.12                   magrittr_2.0.3                   
[141] GenomeInfoDbData_1.2.10           dotCall64_1.0-2                  
[143] patchwork_1.1.3                   Rhdf5lib_1.22.0                  
[145] munsell_0.5.0                     reticulate_1.32.0                
[147] stringi_1.7.12                    zlibbioc_1.46.0                  
[149] MASS_7.3-60                       plyr_1.8.8                       
[151] parallel_4.3.1                    listenv_0.9.0                    
[153] ggrepel_0.9.3                     deldir_1.0-9                     
[155] CNEr_1.36.0                       Biostrings_2.68.1                
[157] splines_4.3.1                     tensor_1.5                       
[159] hms_1.1.3                         BSgenome.Hsapiens.UCSC.hg38_1.4.5
[161] igraph_1.5.1                      spatstat.geom_3.2-5              
[163] RcppHNSW_0.4.1                    reshape2_1.4.4                   
[165] stats4_4.3.1                      TFMPvalue_0.0.9                  
[167] XML_3.99-0.14                     evaluate_0.21                    
[169] BiocManager_1.30.21.1             JASPAR2020_0.99.10               
[171] tzdb_0.4.0                        httpuv_1.6.11                    
[173] RANN_2.6.1                        polyclip_1.10-4                  
[175] future_1.33.0                     SeuratData_0.2.2.9001            
[177] scattermore_1.2                   rsvd_1.0.5                       
[179] xtable_1.8-4                      restfulr_0.0.15                  
[181] AnnotationFilter_1.24.0           RSpectra_0.16-1                  
[183] later_1.3.1                       googledrive_2.1.1                
[185] viridisLite_0.4.2                 memoise_2.0.1                    
[187] AnnotationDbi_1.62.2              GenomicAlignments_1.36.0         
[189] IRanges_2.34.1                    cluster_2.1.4                    
[191] timechange_0.2.0                  globals_0.16.2