knitr::opts_chunk$set(echo = TRUE, message = FALSE)
# Set the directory to where everything is located
knitr::opts_knit$set(root.dir = '/Users/dfhannum/Desktop/OphthalmApp/') # sets it for all code chunks
library(Seurat) # for scRNA functions and plots
## Loading required package: SeuratObject
## Loading required package: sp
##
## Attaching package: 'SeuratObject'
## The following objects are masked from 'package:base':
##
## intersect, t
library(ggplot2) # for custom plotting
library(data.table) # data file processing and tstrsplit
library(clusterProfiler) # pathway analysis
##
## clusterProfiler v4.12.6 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
##
## Please cite:
##
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang,
## W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize
## multiomics data. Nature Protocols. 2024, 19(11):3292-3320
##
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:stats':
##
## filter
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following object is masked from 'package:SeuratObject':
##
## intersect
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
## tapply, union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:clusterProfiler':
##
## rename
## The following objects are masked from 'package:data.table':
##
## first, second
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:clusterProfiler':
##
## slice
## The following object is masked from 'package:data.table':
##
## shift
## The following object is masked from 'package:sp':
##
## %over%
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:clusterProfiler':
##
## select
##
In this document we are going to use an already created Seurat scRNA object. We are then going to identify cluster markers, and submit those cluster markers for over-representation (enrichment) analysis with MSigDB. We will then make some plots representing the most significant pathways.
I’m using a previously published dataset that I’ve reanalyzed for other example scripts.
We are going to rerun the clustering to identify a smaller number of clusters. Many of these clusters will share similar transcriptomic profiles. For the speed/visualizations of my downstream example it’ll be easier working with fewer clusters.
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 36827
## Number of edges: 1171837
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9915
## Number of communities: 14
## Elapsed time: 4 seconds
# Useful function to find markers for each cluster compared to all others
markers <- FindAllMarkers(object = obj,
only.pos = T, # focusing on upregulated markers
test.use = 'wilcox', # the default test that runs quickly
logfc.threshold = .2 # setting a strong cutoff
)
## Warning: The `slot` argument of `GetAssayData()` is deprecated as of SeuratObject 5.0.0.
## ℹ Please use the `layer` argument instead.
## ℹ The deprecated feature was likely used in the Seurat package.
## Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `PackageCheck()` was deprecated in SeuratObject 5.0.0.
## ℹ Please use `rlang::check_installed()` instead.
## ℹ The deprecated feature was likely used in the Seurat package.
## Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Next we are going to extract the differentially expressed genes for each cluster
# Creating an empty list to add to
marker_list <- list()
# A for loop to loop through the clusters
# You can replace obj$seurat_clusters with your cluster identity
for (i in levels(obj$seurat_clusters)){
# Print statements are always useful in for loops to track progress and to
# make sure it's doing what it's suppose to
print(paste0("Cluster ", i))
# Isolating markers for the given cluster
temp <- markers[markers$cluster == i,]
# Only keeping significant markers
temp <- temp[temp$p_val_adj < .05,]
# Printing a statement saying how many markers we have
print(paste0("Contains ", dim(temp)[1], ' markers'))
# Adding the markers/gene to the list indexed by cluster
marker_list[[i]] <- temp$gene
}
## [1] "Cluster 0"
## [1] "Contains 3240 markers"
## [1] "Cluster 1"
## [1] "Contains 5091 markers"
## [1] "Cluster 2"
## [1] "Contains 2368 markers"
## [1] "Cluster 3"
## [1] "Contains 5532 markers"
## [1] "Cluster 4"
## [1] "Contains 3057 markers"
## [1] "Cluster 5"
## [1] "Contains 1246 markers"
## [1] "Cluster 6"
## [1] "Contains 2237 markers"
## [1] "Cluster 7"
## [1] "Contains 2042 markers"
## [1] "Cluster 8"
## [1] "Contains 1580 markers"
## [1] "Cluster 9"
## [1] "Contains 4231 markers"
## [1] "Cluster 10"
## [1] "Contains 1875 markers"
## [1] "Cluster 11"
## [1] "Contains 2724 markers"
## [1] "Cluster 12"
## [1] "Contains 2712 markers"
## [1] "Cluster 13"
## [1] "Contains 2101 markers"
We will use the molecular signatures database (MSigDB) to load their mouse collections to then run enrichment analysis with clusterProfiler
In this chunk of code we will generate the database for use in clusterProfiler.
# Loading all gene sets present in MSigDB for mouse
all_gene_sets <- msigdbr(species = 'Mus musculus')
head(all_gene_sets) # looking at the first six rows of the data
## # A tibble: 6 × 23
## gene_symbol ncbi_gene ensembl_gene db_gene_symbol db_ncbi_gene db_ensembl_gene
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Abcc4 239273 ENSMUSG0000… ABCC4 10257 ENSG00000125257
## 2 Abraxas2 109359 ENSMUSG0000… ABRAXAS2 23172 ENSG00000165660
## 3 Actn4 60595 ENSMUSG0000… ACTN4 81 ENSG00000130402
## 4 Acvr1 11477 ENSMUSG0000… ACVR1 90 ENSG00000115170
## 5 Adam9 11502 ENSMUSG0000… ADAM9 8754 ENSG00000168615
## 6 Adamts5 23794 ENSMUSG0000… ADAMTS5 11096 ENSG00000154736
## # ℹ 17 more variables: source_gene <chr>, gs_id <chr>, gs_name <chr>,
## # gs_collection <chr>, gs_subcollection <chr>, gs_collection_name <chr>,
## # gs_description <chr>, gs_source_species <chr>, gs_pmid <chr>,
## # gs_geoid <chr>, gs_exact_source <chr>, gs_url <chr>, db_version <chr>,
## # db_target_species <chr>, ortholog_taxon_id <int>, ortholog_sources <chr>,
## # num_ortholog_sources <dbl>
Next we will look at the different collections in MSigDB.
## [1] "BioCarta Pathways"
## [2] "Cancer Gene Neighborhoods"
## [3] "Cancer Modules"
## [4] "Canonical Pathways"
## [5] "Cell Type Signature"
## [6] "Chemical and Genetic Perturbations"
## [7] "Curated Cancer Cell Atlas gene sets "
## [8] "GO Biological Process"
## [9] "GO Cellular Component"
## [10] "GO Molecular Function"
## [11] "GTRD"
## [12] "Hallmark"
## [13] "HIPC Vaccine Response"
## [14] "Human Phenotype Ontology"
## [15] "ImmuneSigDB"
## [16] "KEGG Legacy Pathways"
## [17] "KEGG Medicus Pathways"
## [18] "MIR_Legacy"
## [19] "miRDB"
## [20] "Oncogenic Signature"
## [21] "PID Pathways"
## [22] "Positional"
## [23] "Reactome Pathways"
## [24] "TFT_Legacy"
## [25] "WikiPathways"
More details about the collections are available.
## # A tibble: 6 × 4
## gs_collection gs_subcollection gs_collection_name num_genesets
## <chr> <chr> <chr> <int>
## 1 C1 "" Positional 302
## 2 C2 "CGP" Chemical and Genetic Perturbatio… 3494
## 3 C2 "CP" Canonical Pathways 19
## 4 C2 "CP:BIOCARTA" BioCarta Pathways 292
## 5 C2 "CP:KEGG_LEGACY" KEGG Legacy Pathways 186
## 6 C2 "CP:KEGG_MEDICUS" KEGG Medicus Pathways 658
Next we will run the enricher function on the top markers from each cluster.
pathway_results <- list()
for (i in names(marker_list)){
print(i)
res <- enricher(gene = marker_list[[i]], # cluster marker genes
TERM2GENE = msigdbr_t2g, # reference database
pvalueCutoff = 1)
pathway_results[[i]] <- res
}
## [1] "0"
## [1] "1"
## [1] "2"
## [1] "3"
## [1] "4"
## [1] "5"
## [1] "6"
## [1] "7"
## [1] "8"
## [1] "9"
## [1] "10"
## [1] "11"
## [1] "12"
## [1] "13"
## #
## # over-representation test
## #
## #...@organism UNKNOWN
## #...@ontology UNKNOWN
## #...@gene chr [1:3240] "Slc24a1" "Cnga1" "Nr2e3" "Cngb1" "Impg1" "Pde6a" "Samd11" ...
## #...pvalues adjusted by 'BH' with cutoff <1
## #...4674 enriched terms found
## 'data.frame': 4674 obs. of 12 variables:
## $ ID : chr "CRX_NRL_DN.V1_UP" "CRX_DN.V1_UP" "NRL_DN.V1_UP" "HP_NYCTALOPIA" ...
## $ Description : chr "CRX_NRL_DN.V1_UP" "CRX_DN.V1_UP" "NRL_DN.V1_UP" "HP_NYCTALOPIA" ...
## $ GeneRatio : chr "109/2824" "98/2824" "95/2824" "105/2824" ...
## $ BgRatio : chr "137/17963" "132/17963" "136/17963" "218/17963" ...
## $ RichFactor : num 0.796 0.742 0.699 0.482 0.471 ...
## $ FoldEnrichment: num 5.06 4.72 4.44 3.06 3 ...
## $ zScore : num 20.6 18.5 17.4 13.2 12.4 ...
## $ pvalue : num 5.15e-62 6.54e-51 1.53e-45 1.46e-29 1.92e-26 ...
## $ p.adjust : num 1.35e-57 8.61e-47 1.34e-41 9.62e-26 1.01e-22 ...
## $ qvalue : num 1.16e-57 7.36e-47 1.15e-41 8.23e-26 8.66e-23 ...
## $ geneID : chr "Slc24a1/Cnga1/Nr2e3/Cngb1/Impg1/Pde6a/Samd11/Nrl/Reep6/Esrrb/Rdh12/Rtbdn/Pla2r1/Nxnl2/Kcnj14/Dpf3/Grtp1/Faim/Gu"| __truncated__ "Slc24a1/Cngb1/Pde6a/Reep6/Vtn/Esrrb/Rdh12/Rtbdn/Grk1/Abca4/Pla2r1/Nxnl2/Kcnj14/Dpf3/Grtp1/Faim/Guca1b/Pfkfb2/Cd"| __truncated__ "Slc24a1/Cnga1/Nr2e3/Cngb1/Impg1/Pde6a/Samd11/Nrl/Reep6/Esrrb/Pde6b/Pla2r1/Kcnj14/Dpf3/Grtp1/Faim/Bbs9/Cdr2/Nxnl"| __truncated__ "Slc24a1/Cnga1/Nr2e3/Cngb1/Impg1/Pde6a/Nrl/Reep6/Prom1/Pde6b/Rdh12/Grk1/Abca4/Ush2a/Aipl1/Fscn2/Guca1b/Pitpnm3/B"| __truncated__ ...
## $ Count : int 109 98 95 105 97 164 64 59 105 86 ...
## #...Citation
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu.
## clusterProfiler 4.0: A universal enrichment tool for interpreting omics data.
## The Innovation. 2021, 2(3):100141
## [1] 4674 12
## ID
## CRX_NRL_DN.V1_UP CRX_NRL_DN.V1_UP
## CRX_DN.V1_UP CRX_DN.V1_UP
## NRL_DN.V1_UP NRL_DN.V1_UP
## HP_NYCTALOPIA HP_NYCTALOPIA
## HP_ABNORMAL_ELECTRORETINOGRAM HP_ABNORMAL_ELECTRORETINOGRAM
## GOBP_MRNA_PROCESSING GOBP_MRNA_PROCESSING
## Description GeneRatio BgRatio
## CRX_NRL_DN.V1_UP CRX_NRL_DN.V1_UP 109/2824 137/17963
## CRX_DN.V1_UP CRX_DN.V1_UP 98/2824 132/17963
## NRL_DN.V1_UP NRL_DN.V1_UP 95/2824 136/17963
## HP_NYCTALOPIA HP_NYCTALOPIA 105/2824 218/17963
## HP_ABNORMAL_ELECTRORETINOGRAM HP_ABNORMAL_ELECTRORETINOGRAM 97/2824 206/17963
## GOBP_MRNA_PROCESSING GOBP_MRNA_PROCESSING 164/2824 469/17963
## RichFactor FoldEnrichment zScore pvalue
## CRX_NRL_DN.V1_UP 0.7956204 5.060811 20.60664 5.148974e-62
## CRX_DN.V1_UP 0.7424242 4.722439 18.53906 6.541578e-51
## NRL_DN.V1_UP 0.6985294 4.443231 17.40835 1.530170e-45
## HP_NYCTALOPIA 0.4816514 3.063705 13.24034 1.462337e-29
## HP_ABNORMAL_ELECTRORETINOGRAM 0.4708738 2.995151 12.43900 1.923784e-26
## GOBP_MRNA_PROCESSING 0.3496802 2.224258 11.60315 2.534509e-25
## p.adjust qvalue
## CRX_NRL_DN.V1_UP 1.354747e-57 1.158465e-57
## CRX_DN.V1_UP 8.605773e-47 7.358931e-47
## NRL_DN.V1_UP 1.342010e-41 1.147574e-41
## HP_NYCTALOPIA 9.618888e-26 8.225262e-26
## HP_ABNORMAL_ELECTRORETINOGRAM 1.012334e-22 8.656624e-23
## GOBP_MRNA_PROCESSING 1.034829e-21 8.848982e-22
## geneID
## CRX_NRL_DN.V1_UP Slc24a1/Cnga1/Nr2e3/Cngb1/Impg1/Pde6a/Samd11/Nrl/Reep6/Esrrb/Rdh12/Rtbdn/Pla2r1/Nxnl2/Kcnj14/Dpf3/Grtp1/Faim/Guca1b/Bbs9/Cdr2/Nxnl1/Sntg2/Ankrd33b/Cabp4/Gpsm2/Drd4/Atp8a2/Kcnb1/Sntb2/Sh2d1a/Bicdl1/Arhgap10/Mef2c/Tnfaip3/Slc16a6/Polg2/Diaph3/Ppp1r42/Taok3/Pdia5/B4galt1/Slc25a25/Rnf207/Gnat1/Alpl/St8sia1/Ctif/Plpp2/Snta1/Ttc39c/Aqp1/Camk1d/Ext1/Rho/Kmt2c/Ccn4/Plekha2/Llgl2/Lrrc2/St3gal1/Crocc/Col19a1/Dhrs3/Klhl36/Gas7/Bst1/Prkab1/Ssbp4/Ahrr/Sap30/Sec14l2/Ubxn11/Tcp11/Serinc4/Hcls1/Ppara/Galnt10/Ier3/Stk17b/Dhh/Zfp36l2/Psd/Arg2/Ppargc1b/Pip4k2c/Pde8a/Armc1/Hspa1a/Mest/Tnip2/Hspa1b/Gadd45a/Edn2/Mlf1/Paqr4/Abhd14a/Nrm/2610528A11Rik/Rhpn1/Rnf123/Bub1b/Ckmt1/Slc16a3/Abcg4/Slc29a2/Cdkn2c/Eps8l1/Ptpn18
## CRX_DN.V1_UP Slc24a1/Cngb1/Pde6a/Reep6/Vtn/Esrrb/Rdh12/Rtbdn/Grk1/Abca4/Pla2r1/Nxnl2/Kcnj14/Dpf3/Grtp1/Faim/Guca1b/Pfkfb2/Cdr2/Nxnl1/Sntg2/Cabp4/Gpsm2/Fam161a/Sntb2/Sh2d1a/Bicdl1/Mdm1/Pacrg/Arhgap10/Mef2c/Tnfaip3/Ppp3cc/Rpgrip1/Tmlhe/Rabgef1/Polg2/Slc17a1/Ppp1r42/Synpo2/Taok3/Rcvrn/Plekhf2/Pdia5/B4galt1/Slc25a25/Rnf207/Gnat1/Alpl/Ankrd33/St8sia1/Plpp2/Snta1/Trafd1/Ttc39c/Aqp1/Ddhd1/Prph2/Stk35/Ccn4/Suv39h2/Ubtd1/Llgl2/Dock8/Jag1/Col19a1/Gas7/Bst1/Ubxn11/Tcp11/Serinc4/Hcls1/Ier3/Stk17b/Dhh/Dusp14/Wdr31/Arg2/Fam107b/Hspa1a/Hspa1b/Csgalnact2/Paqr4/Sh3yl1/Nrm/Rhpn1/Rnf123/Bub1b/Gpsm3/Bcar3/Klkb1/Faah/Rad51/Ckmt1/Alpk2/Tmem229b/Insl5/Cltrn
## NRL_DN.V1_UP Slc24a1/Cnga1/Nr2e3/Cngb1/Impg1/Pde6a/Samd11/Nrl/Reep6/Esrrb/Pde6b/Pla2r1/Kcnj14/Dpf3/Grtp1/Faim/Bbs9/Cdr2/Nxnl1/Sntg2/Gpsm2/Atp8a2/Kcnb1/Sntb2/Sh2d1a/Bicdl1/Arhgap10/Mef2c/Tnfaip3/Slc16a6/Galnt7/Polg2/Taok3/Plekhf2/Ebpl/Pdia5/B4galt1/Rnf207/Gnat1/Alpl/St8sia1/Ctif/Plpp2/Snta1/Ttc39c/Aqp1/Ext1/Gnb1/Stk35/Rho/Kmt2c/Ccn4/Trpc1/Plekha2/Lmo1/Llgl2/Lrrc2/St3gal1/Kmt2a/Col19a1/Dhrs3/Klhl36/Gas7/Bst1/Ahrr/Eml3/Sap30/Styx/Sec14l2/Tcp11/Ppara/Galnt10/Stk17b/Zfp36l2/Dusp14/Arg2/Ppargc1b/Armc1/Cdk5r2/Plagl2/Gadd45a/Edn2/Mlf1/Supt6/Rhbdl3/Ccna2/2610528A11Rik/Rhpn1/Ngdn/Cldn23/Shisa2/Zdhhc6/Stard10/Slc29a2/Hic2
## HP_NYCTALOPIA Slc24a1/Cnga1/Nr2e3/Cngb1/Impg1/Pde6a/Nrl/Reep6/Prom1/Pde6b/Rdh12/Grk1/Abca4/Ush2a/Aipl1/Fscn2/Guca1b/Pitpnm3/Bbs9/Prcd/Tulp1/Impg2/Cabp4/Fam161a/Rbp3/Rp1l1/Elovl4/Mak/Kcnv2/Rp1/Tub/Cdhr1/Cerkl/Gucy2e/Rpgrip1/Bbs7/Crx/Bbs5/Gnat1/Cep290/Pcare/Adgrv1/Guca1a/Pcdh15/Arl6/Impdh1/Prph2/Rab28/Pde6g/Rom1/Rho/Arl3/Sag/Unc119/Lca5/Cacna1f/Whrn/Ttc8/Ahr/Hk1/Crb1/Cacna2d4/Ttll5/Poc1b/Arsg/Hadha/Cc2d2a/Kif3b/Bbs2/Bbs12/Rpgr/D630045J12Rik/Wdpcp/Ift74/Trnt1/Vps13b/Pex14/Snrnp200/Pomgnt1/Zpr1/Cep19/Lztfl1/Pex3/Phyh/Ift43/Dhdds/Prpf4/Prpf3/Pex1/Ift88/Stim1/Agbl5/Ggcx/Zfp513/Bbs4/Pex10/Cwc27/Exosc2/Mkks/Sdccag8/Ift172/Cog4/Bbs1/Dhx38/Ofd1
## HP_ABNORMAL_ELECTRORETINOGRAM Slc24a1/Cnga1/Nr2e3/Cngb1/Impg1/Pde6a/Nrl/Reep6/Prom1/Pde6b/Rdh12/Rs1/Grk1/Abca4/Ush2a/Aipl1/Fscn2/Guca1b/Bbs9/Prcd/Tulp1/Impg2/Cabp4/Fam161a/Rbp3/Rp1l1/Mak/Rp1/Tub/Cdhr1/Cerkl/Gucy2e/Rpgrip1/Bbs7/Crx/Bbs5/Gnat1/Cep290/Pcare/Adgrv1/Guca1a/Pcdh15/Arl6/Impdh1/Prph2/Pde6g/Gnb5/Rom1/Wwox/Rho/Arl3/Sag/Lca5/Cacna1f/Whrn/Ttc8/Ahr/Crb1/Cacna2d4/Pde6d/Ttll5/Rd3/Arsg/Hadha/Cc2d2a/Kif3b/Bbs2/Bbs12/Rpgr/D630045J12Rik/Wdpcp/Ift74/Pcyt1a/Large1/Snrnp200/Pomgnt1/Cep19/Lztfl1/Nup54/Dhdds/Prpf4/Prpf3/Pex1/Ift88/Agbl5/Zfp513/Rnf216/Bbs4/Dnajc5/Cwc27/Cln3/Mkks/Sdccag8/Ift172/Bbs1/Dhx38/Ofd1
## GOBP_MRNA_PROCESSING Mbnl1/Ahcyl1/Pcbp4/Hnrnpu/Rsrp1/Ddx5/Mbnl2/Thoc5/Ssu72/Ybx1/Rbm20/Rrp1b/Ppargc1a/Hnrnpf/Hnrnph3/Son/Srpk2/Celf2/Rbm7/Srsf7/Thoc2/Raly/Ddx17/Upf3b/Ddx42/Zrsr2/Sf3a3/Upf3a/Sf3a1/Snrnp70/Snrpc/Ncbp2/Ppil2/Supt6/Dyrk1a/Hnrnpul1/Dcps/Tent4b/Cdk11b/Nsrp1/Prpf39/Eif4a3/Luc7l/Alyref/Cdk12/Mtpap/Ptcd2/Sf3b3/Sart1/Ncbp1/Htatsf1/Snrnp200/Pcif1/Kin/Zpr1/Cdc5l/Trmt10c/Srsf3/Ppil1/Casc3/Ppie/Fra10ac1/Clasrp/Leo1/Aar2/Acin1/Yju2/Ppwd1/Ppih/Sugp1/Rbm28/Prmt7/Ddx23/Ccar2/Wbp4/Sart3/Thrap3/Thoc6/Dazap1/Prpf40a/Srsf4/Tsen2/Prkrip1/Nsun2/Rbm14/Prpf4/Ess2/Safb/Cbll1/Prpf3/Eftud2/Smn1/Prpf40b/Alkbh5/Xab2/Exosc10/Upf1/Kat8/Xrn2/Sf3a2/Wtap/Dbr1/Nrde2/Cactin/Prmt9/Chd8/Cwc22/Sf1/Rbm25/Tent4a/Rbm10/Ccdc130/Tent2/Hnrnpl/Tcerg1/Papolg/Ccdc84/Rngtt/Nup98/Ppp1r8/Iws1/Cstf1/Srrm1/Gemin5/Pde12/Sympk/Rbm48/Tfip11/Cnot6l/Rbm15/Cstf2t/Mettl14/Pnpt1/Hsf1/Prmt5/Safb2/Rbm4/Trub2/Tra2a/Srrm2/Rbm15b/Cwc27/Tbrg4/Cpsf1/Dhx15/Snrnp27/Papolb/Gemin2/Cwc25/Rbm41/Cpsf4/Dhx35/Usp4/Ern1/Dhx8/Dhx38/Cpsf7/Ncbp3/Cmtr2/Cpsf6/Khdc4/Paxbp1/Apobec2/Dus3l
## Count
## CRX_NRL_DN.V1_UP 109
## CRX_DN.V1_UP 98
## NRL_DN.V1_UP 95
## HP_NYCTALOPIA 105
## HP_ABNORMAL_ELECTRORETINOGRAM 97
## GOBP_MRNA_PROCESSING 164
Here we have the gene set name but not the collection they belong to, so we’ll need to map back to that.
# creating a dataframe with gene sets mapped to their collection name
collections <- as.data.frame(dplyr::distinct(all_gene_sets, gs_name, gs_collection_name))
# # Checking that they all overlap
# table(temp$ID %in% collections$gs_name)
rownames(collections) <- collections$gs_name
temp$collection <- collections[temp$ID,]$gs_collection_name
# Looking at how many entries we have for each collection in our results
table(temp$collection)
##
## BioCarta Pathways Cancer Gene Neighborhoods
## 19 116
## Cancer Modules Canonical Pathways
## 44 1
## Cell Type Signature Chemical and Genetic Perturbations
## 24 215
## Curated Cancer Cell Atlas gene sets GO Biological Process
## 4 475
## GO Cellular Component GO Molecular Function
## 134 180
## GTRD Hallmark
## 123 6
## HIPC Vaccine Response Human Phenotype Ontology
## 28 685
## ImmuneSigDB KEGG Legacy Pathways
## 1181 11
## KEGG Medicus Pathways MIR_Legacy
## 27 128
## miRDB Oncogenic Signature
## 866 20
## PID Pathways Positional
## 20 18
## Reactome Pathways TFT_Legacy
## 187 108
## WikiPathways
## 54
# Looking at the number of significant entries (TRUE) for each collection
table(temp$collection,temp$p.adjust < .05)
##
## FALSE TRUE
## BioCarta Pathways 15 4
## Cancer Gene Neighborhoods 52 64
## Cancer Modules 30 14
## Canonical Pathways 1 0
## Cell Type Signature 11 13
## Chemical and Genetic Perturbations 115 100
## Curated Cancer Cell Atlas gene sets 0 4
## GO Biological Process 244 231
## GO Cellular Component 54 80
## GO Molecular Function 86 94
## GTRD 43 80
## Hallmark 2 4
## HIPC Vaccine Response 7 21
## Human Phenotype Ontology 364 321
## ImmuneSigDB 564 617
## KEGG Legacy Pathways 5 6
## KEGG Medicus Pathways 21 6
## MIR_Legacy 48 80
## miRDB 440 426
## Oncogenic Signature 10 10
## PID Pathways 9 11
## Positional 14 4
## Reactome Pathways 87 100
## TFT_Legacy 58 50
## WikiPathways 32 22
Next we’ll view and plot the WikiPathways results, one of the collections in MSigDB.
wiki <- temp[temp$collection == 'WikiPathways',] # subsetting to just Wiki results
wiki$id <- gsub('WP_','', wiki$ID) # removing the WP_ preceder from term IDs
wiki$id <- substr(wiki$id,1,30) # limiting to thirty characters for ID bc some get long
wiki$id <- factor(wiki$id, levels = rev(unique(wiki$id))) # ordering them by p-vals
ggplot(wiki[1:10,], # only plotting the first ten
aes(x = -log10(p.adjust),
y = id,
color = FoldEnrichment,
size = Count)) +
theme_bw() + # my theme of choice
geom_point() +
scale_color_gradient(low = 'pink', high = 'darkred') + # creating a monotonic color scheme
# creating a significance cutoff to let viewers know all values plotted are significant
geom_vline(xintercept = -log10(0.05), colour = 'lightgrey', linetype = 2, linewidth = 1) +
# increasing the xlims because the larger points were getting cut off
xlim(c(-log10(0.05),-log10(min(wiki$p.adjust)) + 1)) +
# cleaning up the labels on the graph
labs(fill = 'Fold\nEnrichment',
size = 'Associated\nGenes') +
xlab('-log10 FDR') +
ylab("WikiPathway Terms") +
ggtitle("Top Ten WikiPathway Terms Associated with C1")