knitr::opts_chunk$set(echo = TRUE, message = FALSE)

# Set the directory to where everything is located
knitr::opts_knit$set(root.dir = '/Users/dfhannum/Desktop/OphthalmApp/') # sets it for all code chunks

library(Seurat) # for scRNA functions and plots

## Loading required package: SeuratObject

## Loading required package: sp

## 
## Attaching package: 'SeuratObject'

## The following objects are masked from 'package:base':
## 
##     intersect, t

library(ggplot2) # for custom plotting
library(data.table) # data file processing and tstrsplit
library(clusterProfiler) # pathway analysis

##

## clusterProfiler v4.12.6 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
## 
## Please cite:
## 
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang,
## W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize
## multiomics data. Nature Protocols. 2024, 19(11):3292-3320

## 
## Attaching package: 'clusterProfiler'

## The following object is masked from 'package:stats':
## 
##     filter

library(org.Mm.eg.db) # <org.Hs.eg.db> human or <org.Rn.eg.db> rat

## Loading required package: AnnotationDbi

## Loading required package: stats4

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following object is masked from 'package:SeuratObject':
## 
##     intersect

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
##     tapply, union, unique, unsplit, which.max, which.min

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Loading required package: IRanges

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:clusterProfiler':
## 
##     rename

## The following objects are masked from 'package:data.table':
## 
##     first, second

## The following object is masked from 'package:utils':
## 
##     findMatches

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:clusterProfiler':
## 
##     slice

## The following object is masked from 'package:data.table':
## 
##     shift

## The following object is masked from 'package:sp':
## 
##     %over%

## 
## Attaching package: 'AnnotationDbi'

## The following object is masked from 'package:clusterProfiler':
## 
##     select

##

library(msigdbr) # access to MSigDB

1 Introduction

In this document we are going to use an already created Seurat scRNA object. We are then going to identify cluster markers, and submit those cluster markers for over-representation (enrichment) analysis with MSigDB. We will then make some plots representing the most significant pathways.

2 Data

I’m using a previously published dataset that I’ve reanalyzed for other example scripts.

# Loading in the data
obj <- readRDS('./data/seurat_obj_full.Rds')

# Visualizing the data
DimPlot(obj)

We are going to rerun the clustering to identify a smaller number of clusters. Many of these clusters will share similar transcriptomic profiles. For the speed/visualizations of my downstream example it’ll be easier working with fewer clusters.

obj <- FindClusters(obj, resolution = .05)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 36827
## Number of edges: 1171837
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9915
## Number of communities: 14
## Elapsed time: 4 seconds

# Projecting the data in umap space
DimPlot(obj)

3 Finding Cluster Marker Genes

# Useful function to find markers for each cluster compared to all others
markers <- FindAllMarkers(object = obj,
                          only.pos = T, # focusing on upregulated markers
                          test.use = 'wilcox', # the default test that runs quickly
                          logfc.threshold = .2 # setting a strong cutoff 
                          )

## Warning: The `slot` argument of `GetAssayData()` is deprecated as of SeuratObject 5.0.0.
## ℹ Please use the `layer` argument instead.
## ℹ The deprecated feature was likely used in the Seurat package.
##   Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: `PackageCheck()` was deprecated in SeuratObject 5.0.0.
## ℹ Please use `rlang::check_installed()` instead.
## ℹ The deprecated feature was likely used in the Seurat package.
##   Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Next we are going to extract the differentially expressed genes for each cluster

# Creating an empty list to add to
marker_list <- list()

# A for loop to loop through the clusters
# You can replace obj$seurat_clusters with your cluster identity
for (i in levels(obj$seurat_clusters)){
  
  # Print statements are always useful in for loops to track progress and to 
  # make sure it's doing what it's suppose to
  print(paste0("Cluster ", i))
  
  # Isolating markers for the given cluster
  temp <- markers[markers$cluster == i,]
  
  # Only keeping significant markers
  temp <- temp[temp$p_val_adj < .05,]
  
  # Printing a statement saying how many markers we have
  print(paste0("Contains ", dim(temp)[1], ' markers'))
  
  # Adding the markers/gene to the list indexed by cluster
  marker_list[[i]] <- temp$gene
}

## [1] "Cluster 0"
## [1] "Contains 3240 markers"
## [1] "Cluster 1"
## [1] "Contains 5091 markers"
## [1] "Cluster 2"
## [1] "Contains 2368 markers"
## [1] "Cluster 3"
## [1] "Contains 5532 markers"
## [1] "Cluster 4"
## [1] "Contains 3057 markers"
## [1] "Cluster 5"
## [1] "Contains 1246 markers"
## [1] "Cluster 6"
## [1] "Contains 2237 markers"
## [1] "Cluster 7"
## [1] "Contains 2042 markers"
## [1] "Cluster 8"
## [1] "Contains 1580 markers"
## [1] "Cluster 9"
## [1] "Contains 4231 markers"
## [1] "Cluster 10"
## [1] "Contains 1875 markers"
## [1] "Cluster 11"
## [1] "Contains 2724 markers"
## [1] "Cluster 12"
## [1] "Contains 2712 markers"
## [1] "Cluster 13"
## [1] "Contains 2101 markers"

4 MSigDB

We will use the molecular signatures database (MSigDB) to load their mouse collections to then run enrichment analysis with clusterProfiler

4.1 Database generation

In this chunk of code we will generate the database for use in clusterProfiler.

# Loading all gene sets present in MSigDB for mouse
all_gene_sets <- msigdbr(species = 'Mus musculus')

head(all_gene_sets) # looking at the first six rows of the data

## # A tibble: 6 × 23
##   gene_symbol ncbi_gene ensembl_gene db_gene_symbol db_ncbi_gene db_ensembl_gene
##   <chr>       <chr>     <chr>        <chr>          <chr>        <chr>          
## 1 Abcc4       239273    ENSMUSG0000… ABCC4          10257        ENSG00000125257
## 2 Abraxas2    109359    ENSMUSG0000… ABRAXAS2       23172        ENSG00000165660
## 3 Actn4       60595     ENSMUSG0000… ACTN4          81           ENSG00000130402
## 4 Acvr1       11477     ENSMUSG0000… ACVR1          90           ENSG00000115170
## 5 Adam9       11502     ENSMUSG0000… ADAM9          8754         ENSG00000168615
## 6 Adamts5     23794     ENSMUSG0000… ADAMTS5        11096        ENSG00000154736
## # ℹ 17 more variables: source_gene <chr>, gs_id <chr>, gs_name <chr>,
## #   gs_collection <chr>, gs_subcollection <chr>, gs_collection_name <chr>,
## #   gs_description <chr>, gs_source_species <chr>, gs_pmid <chr>,
## #   gs_geoid <chr>, gs_exact_source <chr>, gs_url <chr>, db_version <chr>,
## #   db_target_species <chr>, ortholog_taxon_id <int>, ortholog_sources <chr>,
## #   num_ortholog_sources <dbl>

Next we will look at the different collections in MSigDB.

unique(all_gene_sets$gs_collection_name)[order(unique(all_gene_sets$gs_collection_name))]

##  [1] "BioCarta Pathways"                   
##  [2] "Cancer Gene Neighborhoods"           
##  [3] "Cancer Modules"                      
##  [4] "Canonical Pathways"                  
##  [5] "Cell Type Signature"                 
##  [6] "Chemical and Genetic Perturbations"  
##  [7] "Curated Cancer Cell Atlas gene sets "
##  [8] "GO Biological Process"               
##  [9] "GO Cellular Component"               
## [10] "GO Molecular Function"               
## [11] "GTRD"                                
## [12] "Hallmark"                            
## [13] "HIPC Vaccine Response"               
## [14] "Human Phenotype Ontology"            
## [15] "ImmuneSigDB"                         
## [16] "KEGG Legacy Pathways"                
## [17] "KEGG Medicus Pathways"               
## [18] "MIR_Legacy"                          
## [19] "miRDB"                               
## [20] "Oncogenic Signature"                 
## [21] "PID Pathways"                        
## [22] "Positional"                          
## [23] "Reactome Pathways"                   
## [24] "TFT_Legacy"                          
## [25] "WikiPathways"

More details about the collections are available.

head(msigdbr_collections())

## # A tibble: 6 × 4
##   gs_collection gs_subcollection  gs_collection_name                num_genesets
##   <chr>         <chr>             <chr>                                    <int>
## 1 C1            ""                Positional                                 302
## 2 C2            "CGP"             Chemical and Genetic Perturbatio…         3494
## 3 C2            "CP"              Canonical Pathways                          19
## 4 C2            "CP:BIOCARTA"     BioCarta Pathways                          292
## 5 C2            "CP:KEGG_LEGACY"  KEGG Legacy Pathways                       186
## 6 C2            "CP:KEGG_MEDICUS" KEGG Medicus Pathways                      658

# Collecting unique rows in the database and creating the term to gene object
msigdbr_t2g <- dplyr::distinct(all_gene_sets, gs_name, gene_symbol)

4.2 Running enricher

Next we will run the enricher function on the top markers from each cluster.

pathway_results <- list()
for (i in names(marker_list)){
  print(i)
  
  res <- enricher(gene = marker_list[[i]], # cluster marker genes
                  TERM2GENE = msigdbr_t2g, # reference database
                  pvalueCutoff = 1)
  
  pathway_results[[i]] <- res
}

## [1] "0"
## [1] "1"
## [1] "2"
## [1] "3"
## [1] "4"
## [1] "5"
## [1] "6"
## [1] "7"
## [1] "8"
## [1] "9"
## [1] "10"
## [1] "11"
## [1] "12"
## [1] "13"

4.3 Looking at results for cluster 1

temp <- pathway_results[[1]]

temp

## #
## # over-representation test
## #
## #...@organism     UNKNOWN 
## #...@ontology     UNKNOWN 
## #...@gene     chr [1:3240] "Slc24a1" "Cnga1" "Nr2e3" "Cngb1" "Impg1" "Pde6a" "Samd11" ...
## #...pvalues adjusted by 'BH' with cutoff <1 
## #...4674 enriched terms found
## 'data.frame':    4674 obs. of  12 variables:
##  $ ID            : chr  "CRX_NRL_DN.V1_UP" "CRX_DN.V1_UP" "NRL_DN.V1_UP" "HP_NYCTALOPIA" ...
##  $ Description   : chr  "CRX_NRL_DN.V1_UP" "CRX_DN.V1_UP" "NRL_DN.V1_UP" "HP_NYCTALOPIA" ...
##  $ GeneRatio     : chr  "109/2824" "98/2824" "95/2824" "105/2824" ...
##  $ BgRatio       : chr  "137/17963" "132/17963" "136/17963" "218/17963" ...
##  $ RichFactor    : num  0.796 0.742 0.699 0.482 0.471 ...
##  $ FoldEnrichment: num  5.06 4.72 4.44 3.06 3 ...
##  $ zScore        : num  20.6 18.5 17.4 13.2 12.4 ...
##  $ pvalue        : num  5.15e-62 6.54e-51 1.53e-45 1.46e-29 1.92e-26 ...
##  $ p.adjust      : num  1.35e-57 8.61e-47 1.34e-41 9.62e-26 1.01e-22 ...
##  $ qvalue        : num  1.16e-57 7.36e-47 1.15e-41 8.23e-26 8.66e-23 ...
##  $ geneID        : chr  "Slc24a1/Cnga1/Nr2e3/Cngb1/Impg1/Pde6a/Samd11/Nrl/Reep6/Esrrb/Rdh12/Rtbdn/Pla2r1/Nxnl2/Kcnj14/Dpf3/Grtp1/Faim/Gu"| __truncated__ "Slc24a1/Cngb1/Pde6a/Reep6/Vtn/Esrrb/Rdh12/Rtbdn/Grk1/Abca4/Pla2r1/Nxnl2/Kcnj14/Dpf3/Grtp1/Faim/Guca1b/Pfkfb2/Cd"| __truncated__ "Slc24a1/Cnga1/Nr2e3/Cngb1/Impg1/Pde6a/Samd11/Nrl/Reep6/Esrrb/Pde6b/Pla2r1/Kcnj14/Dpf3/Grtp1/Faim/Bbs9/Cdr2/Nxnl"| __truncated__ "Slc24a1/Cnga1/Nr2e3/Cngb1/Impg1/Pde6a/Nrl/Reep6/Prom1/Pde6b/Rdh12/Grk1/Abca4/Ush2a/Aipl1/Fscn2/Guca1b/Pitpnm3/B"| __truncated__ ...
##  $ Count         : int  109 98 95 105 97 164 64 59 105 86 ...
## #...Citation
##  T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu.
##  clusterProfiler 4.0: A universal enrichment tool for interpreting omics data.
##  The Innovation. 2021, 2(3):100141

# Put it in dataframe form to query and plot
temp <- as.data.frame(temp) 

dim(temp)

## [1] 4674   12

head(temp)

##                                                          ID
## CRX_NRL_DN.V1_UP                           CRX_NRL_DN.V1_UP
## CRX_DN.V1_UP                                   CRX_DN.V1_UP
## NRL_DN.V1_UP                                   NRL_DN.V1_UP
## HP_NYCTALOPIA                                 HP_NYCTALOPIA
## HP_ABNORMAL_ELECTRORETINOGRAM HP_ABNORMAL_ELECTRORETINOGRAM
## GOBP_MRNA_PROCESSING                   GOBP_MRNA_PROCESSING
##                                                 Description GeneRatio   BgRatio
## CRX_NRL_DN.V1_UP                           CRX_NRL_DN.V1_UP  109/2824 137/17963
## CRX_DN.V1_UP                                   CRX_DN.V1_UP   98/2824 132/17963
## NRL_DN.V1_UP                                   NRL_DN.V1_UP   95/2824 136/17963
## HP_NYCTALOPIA                                 HP_NYCTALOPIA  105/2824 218/17963
## HP_ABNORMAL_ELECTRORETINOGRAM HP_ABNORMAL_ELECTRORETINOGRAM   97/2824 206/17963
## GOBP_MRNA_PROCESSING                   GOBP_MRNA_PROCESSING  164/2824 469/17963
##                               RichFactor FoldEnrichment   zScore       pvalue
## CRX_NRL_DN.V1_UP               0.7956204       5.060811 20.60664 5.148974e-62
## CRX_DN.V1_UP                   0.7424242       4.722439 18.53906 6.541578e-51
## NRL_DN.V1_UP                   0.6985294       4.443231 17.40835 1.530170e-45
## HP_NYCTALOPIA                  0.4816514       3.063705 13.24034 1.462337e-29
## HP_ABNORMAL_ELECTRORETINOGRAM  0.4708738       2.995151 12.43900 1.923784e-26
## GOBP_MRNA_PROCESSING           0.3496802       2.224258 11.60315 2.534509e-25
##                                   p.adjust       qvalue
## CRX_NRL_DN.V1_UP              1.354747e-57 1.158465e-57
## CRX_DN.V1_UP                  8.605773e-47 7.358931e-47
## NRL_DN.V1_UP                  1.342010e-41 1.147574e-41
## HP_NYCTALOPIA                 9.618888e-26 8.225262e-26
## HP_ABNORMAL_ELECTRORETINOGRAM 1.012334e-22 8.656624e-23
## GOBP_MRNA_PROCESSING          1.034829e-21 8.848982e-22
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               geneID
## CRX_NRL_DN.V1_UP                                                                                                                                                                                                                                                                                                                                      Slc24a1/Cnga1/Nr2e3/Cngb1/Impg1/Pde6a/Samd11/Nrl/Reep6/Esrrb/Rdh12/Rtbdn/Pla2r1/Nxnl2/Kcnj14/Dpf3/Grtp1/Faim/Guca1b/Bbs9/Cdr2/Nxnl1/Sntg2/Ankrd33b/Cabp4/Gpsm2/Drd4/Atp8a2/Kcnb1/Sntb2/Sh2d1a/Bicdl1/Arhgap10/Mef2c/Tnfaip3/Slc16a6/Polg2/Diaph3/Ppp1r42/Taok3/Pdia5/B4galt1/Slc25a25/Rnf207/Gnat1/Alpl/St8sia1/Ctif/Plpp2/Snta1/Ttc39c/Aqp1/Camk1d/Ext1/Rho/Kmt2c/Ccn4/Plekha2/Llgl2/Lrrc2/St3gal1/Crocc/Col19a1/Dhrs3/Klhl36/Gas7/Bst1/Prkab1/Ssbp4/Ahrr/Sap30/Sec14l2/Ubxn11/Tcp11/Serinc4/Hcls1/Ppara/Galnt10/Ier3/Stk17b/Dhh/Zfp36l2/Psd/Arg2/Ppargc1b/Pip4k2c/Pde8a/Armc1/Hspa1a/Mest/Tnip2/Hspa1b/Gadd45a/Edn2/Mlf1/Paqr4/Abhd14a/Nrm/2610528A11Rik/Rhpn1/Rnf123/Bub1b/Ckmt1/Slc16a3/Abcg4/Slc29a2/Cdkn2c/Eps8l1/Ptpn18
## CRX_DN.V1_UP                                                                                                                                                                                                                                                                                                                                                                                                                   Slc24a1/Cngb1/Pde6a/Reep6/Vtn/Esrrb/Rdh12/Rtbdn/Grk1/Abca4/Pla2r1/Nxnl2/Kcnj14/Dpf3/Grtp1/Faim/Guca1b/Pfkfb2/Cdr2/Nxnl1/Sntg2/Cabp4/Gpsm2/Fam161a/Sntb2/Sh2d1a/Bicdl1/Mdm1/Pacrg/Arhgap10/Mef2c/Tnfaip3/Ppp3cc/Rpgrip1/Tmlhe/Rabgef1/Polg2/Slc17a1/Ppp1r42/Synpo2/Taok3/Rcvrn/Plekhf2/Pdia5/B4galt1/Slc25a25/Rnf207/Gnat1/Alpl/Ankrd33/St8sia1/Plpp2/Snta1/Trafd1/Ttc39c/Aqp1/Ddhd1/Prph2/Stk35/Ccn4/Suv39h2/Ubtd1/Llgl2/Dock8/Jag1/Col19a1/Gas7/Bst1/Ubxn11/Tcp11/Serinc4/Hcls1/Ier3/Stk17b/Dhh/Dusp14/Wdr31/Arg2/Fam107b/Hspa1a/Hspa1b/Csgalnact2/Paqr4/Sh3yl1/Nrm/Rhpn1/Rnf123/Bub1b/Gpsm3/Bcar3/Klkb1/Faah/Rad51/Ckmt1/Alpk2/Tmem229b/Insl5/Cltrn
## NRL_DN.V1_UP                                                                                                                                                                                                                                                                                                                                                                                                                                           Slc24a1/Cnga1/Nr2e3/Cngb1/Impg1/Pde6a/Samd11/Nrl/Reep6/Esrrb/Pde6b/Pla2r1/Kcnj14/Dpf3/Grtp1/Faim/Bbs9/Cdr2/Nxnl1/Sntg2/Gpsm2/Atp8a2/Kcnb1/Sntb2/Sh2d1a/Bicdl1/Arhgap10/Mef2c/Tnfaip3/Slc16a6/Galnt7/Polg2/Taok3/Plekhf2/Ebpl/Pdia5/B4galt1/Rnf207/Gnat1/Alpl/St8sia1/Ctif/Plpp2/Snta1/Ttc39c/Aqp1/Ext1/Gnb1/Stk35/Rho/Kmt2c/Ccn4/Trpc1/Plekha2/Lmo1/Llgl2/Lrrc2/St3gal1/Kmt2a/Col19a1/Dhrs3/Klhl36/Gas7/Bst1/Ahrr/Eml3/Sap30/Styx/Sec14l2/Tcp11/Ppara/Galnt10/Stk17b/Zfp36l2/Dusp14/Arg2/Ppargc1b/Armc1/Cdk5r2/Plagl2/Gadd45a/Edn2/Mlf1/Supt6/Rhbdl3/Ccna2/2610528A11Rik/Rhpn1/Ngdn/Cldn23/Shisa2/Zdhhc6/Stard10/Slc29a2/Hic2
## HP_NYCTALOPIA                                                                                                                                                                                                                                                                                                                                                                                                                   Slc24a1/Cnga1/Nr2e3/Cngb1/Impg1/Pde6a/Nrl/Reep6/Prom1/Pde6b/Rdh12/Grk1/Abca4/Ush2a/Aipl1/Fscn2/Guca1b/Pitpnm3/Bbs9/Prcd/Tulp1/Impg2/Cabp4/Fam161a/Rbp3/Rp1l1/Elovl4/Mak/Kcnv2/Rp1/Tub/Cdhr1/Cerkl/Gucy2e/Rpgrip1/Bbs7/Crx/Bbs5/Gnat1/Cep290/Pcare/Adgrv1/Guca1a/Pcdh15/Arl6/Impdh1/Prph2/Rab28/Pde6g/Rom1/Rho/Arl3/Sag/Unc119/Lca5/Cacna1f/Whrn/Ttc8/Ahr/Hk1/Crb1/Cacna2d4/Ttll5/Poc1b/Arsg/Hadha/Cc2d2a/Kif3b/Bbs2/Bbs12/Rpgr/D630045J12Rik/Wdpcp/Ift74/Trnt1/Vps13b/Pex14/Snrnp200/Pomgnt1/Zpr1/Cep19/Lztfl1/Pex3/Phyh/Ift43/Dhdds/Prpf4/Prpf3/Pex1/Ift88/Stim1/Agbl5/Ggcx/Zfp513/Bbs4/Pex10/Cwc27/Exosc2/Mkks/Sdccag8/Ift172/Cog4/Bbs1/Dhx38/Ofd1
## HP_ABNORMAL_ELECTRORETINOGRAM                                                                                                                                                                                                                                                                                                                                                                                                                                                     Slc24a1/Cnga1/Nr2e3/Cngb1/Impg1/Pde6a/Nrl/Reep6/Prom1/Pde6b/Rdh12/Rs1/Grk1/Abca4/Ush2a/Aipl1/Fscn2/Guca1b/Bbs9/Prcd/Tulp1/Impg2/Cabp4/Fam161a/Rbp3/Rp1l1/Mak/Rp1/Tub/Cdhr1/Cerkl/Gucy2e/Rpgrip1/Bbs7/Crx/Bbs5/Gnat1/Cep290/Pcare/Adgrv1/Guca1a/Pcdh15/Arl6/Impdh1/Prph2/Pde6g/Gnb5/Rom1/Wwox/Rho/Arl3/Sag/Lca5/Cacna1f/Whrn/Ttc8/Ahr/Crb1/Cacna2d4/Pde6d/Ttll5/Rd3/Arsg/Hadha/Cc2d2a/Kif3b/Bbs2/Bbs12/Rpgr/D630045J12Rik/Wdpcp/Ift74/Pcyt1a/Large1/Snrnp200/Pomgnt1/Cep19/Lztfl1/Nup54/Dhdds/Prpf4/Prpf3/Pex1/Ift88/Agbl5/Zfp513/Rnf216/Bbs4/Dnajc5/Cwc27/Cln3/Mkks/Sdccag8/Ift172/Bbs1/Dhx38/Ofd1
## GOBP_MRNA_PROCESSING          Mbnl1/Ahcyl1/Pcbp4/Hnrnpu/Rsrp1/Ddx5/Mbnl2/Thoc5/Ssu72/Ybx1/Rbm20/Rrp1b/Ppargc1a/Hnrnpf/Hnrnph3/Son/Srpk2/Celf2/Rbm7/Srsf7/Thoc2/Raly/Ddx17/Upf3b/Ddx42/Zrsr2/Sf3a3/Upf3a/Sf3a1/Snrnp70/Snrpc/Ncbp2/Ppil2/Supt6/Dyrk1a/Hnrnpul1/Dcps/Tent4b/Cdk11b/Nsrp1/Prpf39/Eif4a3/Luc7l/Alyref/Cdk12/Mtpap/Ptcd2/Sf3b3/Sart1/Ncbp1/Htatsf1/Snrnp200/Pcif1/Kin/Zpr1/Cdc5l/Trmt10c/Srsf3/Ppil1/Casc3/Ppie/Fra10ac1/Clasrp/Leo1/Aar2/Acin1/Yju2/Ppwd1/Ppih/Sugp1/Rbm28/Prmt7/Ddx23/Ccar2/Wbp4/Sart3/Thrap3/Thoc6/Dazap1/Prpf40a/Srsf4/Tsen2/Prkrip1/Nsun2/Rbm14/Prpf4/Ess2/Safb/Cbll1/Prpf3/Eftud2/Smn1/Prpf40b/Alkbh5/Xab2/Exosc10/Upf1/Kat8/Xrn2/Sf3a2/Wtap/Dbr1/Nrde2/Cactin/Prmt9/Chd8/Cwc22/Sf1/Rbm25/Tent4a/Rbm10/Ccdc130/Tent2/Hnrnpl/Tcerg1/Papolg/Ccdc84/Rngtt/Nup98/Ppp1r8/Iws1/Cstf1/Srrm1/Gemin5/Pde12/Sympk/Rbm48/Tfip11/Cnot6l/Rbm15/Cstf2t/Mettl14/Pnpt1/Hsf1/Prmt5/Safb2/Rbm4/Trub2/Tra2a/Srrm2/Rbm15b/Cwc27/Tbrg4/Cpsf1/Dhx15/Snrnp27/Papolb/Gemin2/Cwc25/Rbm41/Cpsf4/Dhx35/Usp4/Ern1/Dhx8/Dhx38/Cpsf7/Ncbp3/Cmtr2/Cpsf6/Khdc4/Paxbp1/Apobec2/Dus3l
##                               Count
## CRX_NRL_DN.V1_UP                109
## CRX_DN.V1_UP                     98
## NRL_DN.V1_UP                     95
## HP_NYCTALOPIA                   105
## HP_ABNORMAL_ELECTRORETINOGRAM    97
## GOBP_MRNA_PROCESSING            164

Here we have the gene set name but not the collection they belong to, so we’ll need to map back to that.

# creating a dataframe with gene sets mapped to their collection name
collections <- as.data.frame(dplyr::distinct(all_gene_sets, gs_name, gs_collection_name))

# # Checking that they all overlap
# table(temp$ID %in% collections$gs_name)

rownames(collections) <- collections$gs_name
temp$collection <- collections[temp$ID,]$gs_collection_name

# Looking at how many entries we have for each collection in our results
table(temp$collection)

## 
##                    BioCarta Pathways            Cancer Gene Neighborhoods 
##                                   19                                  116 
##                       Cancer Modules                   Canonical Pathways 
##                                   44                                    1 
##                  Cell Type Signature   Chemical and Genetic Perturbations 
##                                   24                                  215 
## Curated Cancer Cell Atlas gene sets                 GO Biological Process 
##                                    4                                  475 
##                GO Cellular Component                GO Molecular Function 
##                                  134                                  180 
##                                 GTRD                             Hallmark 
##                                  123                                    6 
##                HIPC Vaccine Response             Human Phenotype Ontology 
##                                   28                                  685 
##                          ImmuneSigDB                 KEGG Legacy Pathways 
##                                 1181                                   11 
##                KEGG Medicus Pathways                           MIR_Legacy 
##                                   27                                  128 
##                                miRDB                  Oncogenic Signature 
##                                  866                                   20 
##                         PID Pathways                           Positional 
##                                   20                                   18 
##                    Reactome Pathways                           TFT_Legacy 
##                                  187                                  108 
##                         WikiPathways 
##                                   54

# Looking at the number of significant entries (TRUE) for each collection
table(temp$collection,temp$p.adjust < .05)

##                                       
##                                        FALSE TRUE
##   BioCarta Pathways                       15    4
##   Cancer Gene Neighborhoods               52   64
##   Cancer Modules                          30   14
##   Canonical Pathways                       1    0
##   Cell Type Signature                     11   13
##   Chemical and Genetic Perturbations     115  100
##   Curated Cancer Cell Atlas gene sets      0    4
##   GO Biological Process                  244  231
##   GO Cellular Component                   54   80
##   GO Molecular Function                   86   94
##   GTRD                                    43   80
##   Hallmark                                 2    4
##   HIPC Vaccine Response                    7   21
##   Human Phenotype Ontology               364  321
##   ImmuneSigDB                            564  617
##   KEGG Legacy Pathways                     5    6
##   KEGG Medicus Pathways                   21    6
##   MIR_Legacy                              48   80
##   miRDB                                  440  426
##   Oncogenic Signature                     10   10
##   PID Pathways                             9   11
##   Positional                              14    4
##   Reactome Pathways                       87  100
##   TFT_Legacy                              58   50
##   WikiPathways                            32   22

4.3.1 WikiPathways Results

Next we’ll view and plot the WikiPathways results, one of the collections in MSigDB.

wiki <- temp[temp$collection == 'WikiPathways',] # subsetting to just Wiki results

wiki$id <- gsub('WP_','', wiki$ID) # removing the WP_ preceder from term IDs
wiki$id <- substr(wiki$id,1,30) # limiting to thirty characters for ID bc some get long
wiki$id <- factor(wiki$id, levels = rev(unique(wiki$id))) # ordering them by p-vals

ggplot(wiki[1:10,], # only plotting the first ten 
       aes(x = -log10(p.adjust), 
           y = id,
           color = FoldEnrichment,
           size = Count)) +
  theme_bw() + # my theme of choice
  geom_point() +
  scale_color_gradient(low = 'pink', high = 'darkred') + # creating a monotonic color scheme
  # creating a significance cutoff to let viewers know all values plotted are significant
  geom_vline(xintercept = -log10(0.05), colour = 'lightgrey', linetype = 2, linewidth = 1) +
  # increasing the xlims because the larger points were getting cut off
  xlim(c(-log10(0.05),-log10(min(wiki$p.adjust)) + 1)) +
  # cleaning up the labels on the graph
  labs(fill = 'Fold\nEnrichment', 
       size = 'Associated\nGenes') +
  xlab('-log10 FDR') +
  ylab("WikiPathway Terms") +
  ggtitle("Top Ten WikiPathway Terms Associated with C1")

Pathway Analysis w/ MSigDB

D. Ford Hannum

2025-06-10