library(tidyverse)
library(Seurat)
library(BPCells)
library(Azimuth)
library(SeuratObject)
library(presto)
library(SeuratWrappers)
options(Seurat.object.assay.version = "v5")
BPCells FindMarkers
A reproducible example to compare finding top markers between Seuratv5 and BPCells
libraries
Download data
<- "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM6176195&format=file&file=GSM6176195%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%5Fp1%5Fbiopsy1%5Frna%2Eh5"
url # Only download files if we haven't downloaded already
if (!file.exists("pcnsl.h5")) {
download.file(url, "pcnsl.h5", mode="wb")
}
data loading
# Check if we already ran import
if (!file.exists("pcnsl_raw")) {
<- open_matrix_10x_hdf5("pcnsl.h5", feature_type="Gene Expression") %>%
mat_raw write_matrix_dir("pcnsl_raw")
else {
} <- open_matrix_dir("pcnsl_raw")
mat_raw }
create seurat object
<- Azimuth:::ConvertEnsembleToSymbol(mat = mat_raw, species = "human")
mat_raw <- CreateSeuratObject(counts = mat_raw) seu
Seurat pipeline
<- NormalizeData(seu)
seu <- FindVariableFeatures(seu)
seu <- ScaleData(seu)
seu <- RunPCA(seu)
seu <- FindNeighbors(seu)
seu <- FindClusters(seu, resolution = 0.8)
seu <- RunUMAP(seu, reduction = "pca", dims = 1:30) seu
BPCells markers
no need to convert and very fast
system.time(
<- BPCells::marker_features(seu$RNA$data, Idents(seu))
bp_markers )
Warning: marker features calculation requires row-major storage
• Consider using transpose_storage_order() if running marker_features repeatedly
Writing transposed storage order to /tmp/RtmpcOqfwj/transpose915596b6a78c7
This message is displayed once every 8 hours.
user system elapsed
1.101 0.558 1.660
p values are the same as in Seurat but log2FC are different
|>
bp_markers mutate(log2fc = (foreground_mean - background_mean)/log(2)) |>
mutate(p_val_adj = p.adjust(p_val_raw, method = "bonferroni")) |>
::filter(p_val_adj < 0.05) |>
dplyr::filter(foreground == 13) |>
dplyr::arrange(desc(log2fc)) |>
dplyr::select(feature, p_val_raw, log2fc, p_val_adj) dplyr
# A tibble: 2,123 × 4
feature p_val_raw log2fc p_val_adj
<chr> <dbl> <dbl> <dbl>
1 APOE 4.40e- 42 4.40 1.67e- 36
2 LYZ 1.00e-115 4.32 3.80e-110
3 FTL 3.25e- 60 4.18 1.23e- 54
4 FTH1 5.99e- 66 4.08 2.27e- 60
5 APOC1 1.36e- 84 3.31 5.14e- 79
6 SAT1 3.31e- 54 3.11 1.25e- 48
7 NPC2 7.84e- 56 3.07 2.97e- 50
8 TYROBP 2.32e- 85 3.05 8.80e- 80
9 PSAP 8.87e- 56 3.03 3.36e- 50
10 PTGDS 4.97e- 33 3.02 1.88e- 27
# ℹ 2,113 more rows
presto markers
do not work on BPCells Matrix
::wilcoxauc(seu, "seurat_clusters") presto
Error in UseMethod("rank_matrix"): no applicable method for 'rank_matrix' applied to an object of class "c('RenameDims', 'IterableMatrix')"
Seurat markers
takes much longer, so only calculcate for cluster 13
#this would be the corresponding command but takes very long
# seu_markers <- FindAllMarkers(seu, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
system.time(
<- FindMarkers(seu, ident.1 = "13", ident.2 = NULL, min.pct = 0.25, logfc.threshold = 0.5)
seu_markers_bp )
user system elapsed
3.226 0.035 3.263
without conversion into sparse matrix no p values, but log2FC are the same
|>
seu_markers_bp arrange(desc(avg_log2FC))|>
head(10)
p_val avg_log2FC pct.1 pct.2 p_val_adj
LYZ NA 7.146448 0.894 0.162 NA
APOE NA 7.032490 0.817 0.473 NA
PTGDS NA 6.608698 0.654 0.279 NA
S100A9 NA 6.535848 0.596 0.048 NA
APOC1 NA 6.255360 0.779 0.160 NA
GPNMB NA 5.975639 0.760 0.055 NA
CST3 NA 5.964254 0.885 0.050 NA
CAPG NA 5.941394 0.837 0.100 NA
S100A8 NA 5.878775 0.423 0.025 NA
CD68 NA 5.716077 0.865 0.071 NA
"RNA"]]$data <- as(object = seu[["RNA"]]$data, Class = "dgCMatrix")
seu[[
#this would be the corresponding command but takes very long
# seu_markers <- FindAllMarkers(seu, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
system.time(
<- FindMarkers(seu, ident.1 = "13", ident.2 = NULL, min.pct = 0.25, logfc.threshold = 0.5)
seu_markers )
user system elapsed
38.008 0.335 38.349
|>
seu_markers arrange(desc(avg_log2FC))|>
head(10)
p_val avg_log2FC pct.1 pct.2 p_val_adj
LYZ 1.003133e-115 7.146448 0.894 0.162 2.532911e-111
APOE 4.396535e-42 7.032490 0.817 0.473 1.110125e-37
PTGDS 4.974038e-33 6.608698 0.654 0.279 1.255945e-28
S100A9 1.479987e-126 6.535848 0.596 0.048 3.736967e-122
APOC1 1.356092e-84 6.255360 0.779 0.160 3.424133e-80
GPNMB 1.002223e-184 5.975639 0.760 0.055 2.530614e-180
CST3 5.516853e-260 5.964254 0.885 0.050 1.393005e-255
CAPG 1.569736e-142 5.941394 0.837 0.100 3.963583e-138
S100A8 4.785009e-110 5.878775 0.423 0.025 1.208215e-105
CD68 2.234567e-196 5.716077 0.865 0.071 5.642281e-192
session info
sessionInfo()
R version 4.3.1 (2023-06-16)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.3 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=de_DE.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=de_DE.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=de_DE.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Berlin
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] biomaRt_2.56.1 SeuratWrappers_0.3.19 presto_1.0.0
[4] data.table_1.14.8 Rcpp_1.0.11 Azimuth_0.4.6.9004
[7] shinyBS_0.61.1 BPCells_0.1.0 Seurat_4.9.9.9060
[10] SeuratObject_4.9.9.9091 sp_2.0-0 lubridate_1.9.2
[13] forcats_1.0.0 stringr_1.5.0 dplyr_1.1.3
[16] purrr_1.0.2 readr_2.1.4 tidyr_1.3.0
[19] tibble_3.2.1 ggplot2_3.4.3 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] fs_1.6.3 ProtGenerics_1.32.0
[3] matrixStats_1.0.0 spatstat.sparse_3.0-2
[5] bitops_1.0-7 DirichletMultinomial_1.42.0
[7] TFBSTools_1.38.0 httr_1.4.7
[9] RColorBrewer_1.1-3 tools_4.3.1
[11] sctransform_0.3.5 utf8_1.2.3
[13] R6_2.5.1 DT_0.28
[15] lazyeval_0.2.2 uwot_0.1.16
[17] rhdf5filters_1.12.1 withr_2.5.0
[19] prettyunits_1.1.1 gridExtra_2.3
[21] progressr_0.14.0 cli_3.6.1
[23] Biobase_2.60.0 spatstat.explore_3.2-3
[25] fastDummies_1.7.3 EnsDb.Hsapiens.v86_2.99.0
[27] shinyjs_2.1.0 spatstat.data_3.0-1
[29] ggridges_0.5.4 pbapply_1.7-2
[31] Rsamtools_2.16.0 R.utils_2.12.2
[33] parallelly_1.36.0 limma_3.56.2
[35] BSgenome_1.68.0 rstudioapi_0.15.0
[37] RSQLite_2.3.1 generics_0.1.3
[39] BiocIO_1.10.0 gtools_3.9.4
[41] ica_1.0-3 spatstat.random_3.1-6
[43] googlesheets4_1.1.1 GO.db_3.17.0
[45] Matrix_1.6-0 fansi_1.0.4
[47] S4Vectors_0.38.1 abind_1.4-5
[49] R.methodsS3_1.8.2 lifecycle_1.0.3
[51] yaml_2.3.7 SummarizedExperiment_1.30.2
[53] rhdf5_2.44.0 BiocFileCache_2.8.0
[55] Rtsne_0.16 grid_4.3.1
[57] blob_1.2.4 promises_1.2.1
[59] shinydashboard_0.7.2 crayon_1.5.2
[61] miniUI_0.1.1.1 lattice_0.21-8
[63] cowplot_1.1.1 annotate_1.78.0
[65] GenomicFeatures_1.52.1 KEGGREST_1.40.0
[67] pillar_1.9.0 knitr_1.44
[69] GenomicRanges_1.52.0 rjson_0.2.21
[71] future.apply_1.11.0 codetools_0.2-19
[73] fastmatch_1.1-3 leiden_0.4.3
[75] glue_1.6.2 remotes_2.4.2.1
[77] vctrs_0.6.3 png_0.1-8
[79] spam_2.9-1 cellranger_1.1.0
[81] gtable_0.3.4 poweRlaw_0.70.6
[83] cachem_1.0.8 xfun_0.40
[85] Signac_1.10.0 S4Arrays_1.0.5
[87] mime_0.12 pracma_2.4.2
[89] survival_3.5-5 gargle_1.5.2
[91] RcppRoll_0.3.0 ellipsis_0.3.2
[93] fitdistrplus_1.1-11 ROCR_1.0-11
[95] nlme_3.1-162 bit64_4.0.5
[97] progress_1.2.2 filelock_1.0.2
[99] RcppAnnoy_0.0.21 GenomeInfoDb_1.36.1
[101] irlba_2.3.5.1 KernSmooth_2.23-22
[103] SeuratDisk_0.0.0.9020 colorspace_2.1-0
[105] seqLogo_1.66.0 BiocGenerics_0.46.0
[107] DBI_1.1.3 tidyselect_1.2.0
[109] bit_4.0.5 compiler_4.3.1
[111] curl_5.0.2 hdf5r_1.3.8
[113] xml2_1.3.5 DelayedArray_0.26.7
[115] plotly_4.10.2 rtracklayer_1.60.0
[117] scales_1.2.1 caTools_1.18.2
[119] lmtest_0.9-40 rappdirs_0.3.3
[121] digest_0.6.33 goftest_1.2-3
[123] spatstat.utils_3.0-3 rmarkdown_2.24
[125] XVector_0.40.0 htmltools_0.5.6
[127] pkgconfig_2.0.3 MatrixGenerics_1.12.3
[129] dbplyr_2.3.3 fastmap_1.1.1
[131] ensembldb_2.24.0 rlang_1.1.1
[133] htmlwidgets_1.6.2 shiny_1.7.5
[135] zoo_1.8-12 jsonlite_1.8.7
[137] BiocParallel_1.34.2 R.oo_1.25.0
[139] RCurl_1.98-1.12 magrittr_2.0.3
[141] GenomeInfoDbData_1.2.10 dotCall64_1.0-2
[143] patchwork_1.1.3 Rhdf5lib_1.22.0
[145] munsell_0.5.0 reticulate_1.32.0
[147] stringi_1.7.12 zlibbioc_1.46.0
[149] MASS_7.3-60 plyr_1.8.8
[151] parallel_4.3.1 listenv_0.9.0
[153] ggrepel_0.9.3 deldir_1.0-9
[155] CNEr_1.36.0 Biostrings_2.68.1
[157] splines_4.3.1 tensor_1.5
[159] hms_1.1.3 BSgenome.Hsapiens.UCSC.hg38_1.4.5
[161] igraph_1.5.1 spatstat.geom_3.2-5
[163] RcppHNSW_0.4.1 reshape2_1.4.4
[165] stats4_4.3.1 TFMPvalue_0.0.9
[167] XML_3.99-0.14 evaluate_0.21
[169] BiocManager_1.30.21.1 JASPAR2020_0.99.10
[171] tzdb_0.4.0 httpuv_1.6.11
[173] RANN_2.6.1 polyclip_1.10-4
[175] future_1.33.0 SeuratData_0.2.2.9001
[177] scattermore_1.2 rsvd_1.0.5
[179] xtable_1.8-4 restfulr_0.0.15
[181] AnnotationFilter_1.24.0 RSpectra_0.16-1
[183] later_1.3.1 googledrive_2.1.1
[185] viridisLite_0.4.2 memoise_2.0.1
[187] AnnotationDbi_1.62.2 GenomicAlignments_1.36.0
[189] IRanges_2.34.1 cluster_2.1.4
[191] timechange_0.2.0 globals_0.16.2