YuLab-SMU/ProjectYulab

construct PPI network from ORA result

GuangchuangYu opened this issue · 18 comments

# An early, unpolished, alpha version of the script
```r
library(tidyverse)
library(httr)
library(jsonlite)
library(rlist)
library(magrittr)
library(igraph)
library(clipr)

genelist <- DEG_names

write_clip(genelist) # to make a copy and verify on the STRING database

out_form <- "json" # output formats include: tsv, tsv-no-header, json, xml, psi-mi, psi-mi-tab
input_genes <- paste(genelist, collapse = "%0d")
url_genes <- paste("https://string-db.org/api",
                   out_form,
                   paste("network?identifiers=", input_genes, "&species=10090", sep = ""),
                   sep = "/")
response <- GET(url_genes)
result <- fromJSON(content(response, as = "text"))

write.csv(result, "~/Desktop/test_PPI.csv") # output the result if needed

node <- unique(c(result$preferredName_A, # make sure all the genes are included
                 result$preferredName_B))
network <- graph_from_data_frame(d = result[, c(3, 4, 6)],
                                 vertices = node,
                                 directed = FALSE)
deg <- degree(network, mode = "all")
plot(network,
     edge.arrow.size = .2, edge.curved = 0,
     vertex.size = deg,
     vertex.color = "orange", vertex.frame.color = "firebrick",
     vertex.shape = "circle",
     vertex.label = node, vertex.label.color = "black",
     vertex.label.cex = .7)
```

```r
library(pacman)
pacman::p_load(tidyverse, httr, jsonlite, rlist, magrittr, igraph, clipr)

DEGs_inter <- function(genelist, species) {
  fromJSON(content(paste("https://string-db.org/api",
                         "json",
                         paste("network?identifiers=",
                               genelist %>% paste(collapse = "%0d"),
                               paste("&species=", species, sep = ""),
                               sep = ""),
                         sep = "/") %>% GET(), as = "text"))
}

result <- DEGs_inter(genelist = DEG_names, species = 10090) # Mouse, 10090; Human, 9606; Rat, 10116; etc.
write.csv(result, "~/Desktop/test_PPI.csv") # output the result if needed

network <- graph_from_data_frame(d = result[, c(3, 4, 6)],
                                 vertices = unique(c(result$preferredName_A,
                                                     result$preferredName_B)),
                                 directed = FALSE)
plot(network,
     edge.arrow.size = .2, edge.curved = 0,
     vertex.size = degree(network, mode = "all"),
     vertex.color = "orange", vertex.frame.color = "firebrick",
     vertex.shape = "circle",
     vertex.label = V(network)$name, vertex.label.color = "black", # node names come from the graph itself
     vertex.label.cex = .7)
```

A reproducible error has been found when combining the use of "org.Mm.eg.db" with the customized function `DEGs_inter`.

```r
DEGs_inter <- function(genelist, species) {
  fromJSON(content(paste("https://string-db.org/api",
                         "json",
                         paste("network?identifiers=",
                               genelist %>% paste(collapse = "%0d"),
                               paste("&species=", species, sep = ""),
                               sep = ""),
                         sep = "/") %>% GET(), as = "text"))
}
```

Once "org.Mm.eg.db" is loaded, the content() function always reports an error, regardless of whether any function from "org.Mm.eg.db" is called. Can't figure out the reason for now. 😢

Problem solved by avoiding the use of content(). Now the script is set up to work with the result of enrichKEGG() or enrichGO().

```r
library(pacman)
pacman::p_load(tidyverse, httr, jsonlite, rlist, magrittr, igraph, clipr, clusterProfiler, org.Mm.eg.db)

DEGs_inter <- function(genelist, species) {
  res <- paste("https://string-db.org/api/json",
               paste("network?identifiers=", genelist,
                     paste("&species=", species, sep = ""),
                     sep = ""),
               sep = "/") %>% GET()
  res$content %>% rawToChar() %>% fromJSON()
}

### load your own data
DEG_test <- na.omit(read.csv("~/wonderland/test.csv", header = TRUE, sep = ","))
DEG_test_id <- bitr(DEG_test$geneID, fromType = "ENSEMBL", toType = c("ENTREZID", "SYMBOL"),
                    OrgDb = "org.Mm.eg.db")

### enrichment analysis
kegg_test <- enrichKEGG(gene = DEG_test_id$ENTREZID,
                        organism = "mmu",
                        pAdjustMethod = "BH",
                        pvalueCutoff = 0.01,
                        qvalueCutoff = 0.05) %>%
  setReadable(OrgDb = org.Mm.eg.db, keyType = "ENTREZID")
kegg_sig <- kegg_test@result[kegg_test@result$p.adjust < 0.05, ]

pathway_gene_id <- kegg_sig[kegg_sig$Description == "Apoptosis", ]$geneID %>% # or your own pathway of interest
  strsplit("/") %>% unlist() %>% paste(collapse = "%0d")
result <- DEGs_inter(genelist = pathway_gene_id, species = 10090)
# write.csv(result, "~/Desktop/test_PPI.csv") # output the result if needed

node <- unique(c(result$preferredName_A, result$preferredName_B))
network <- graph_from_data_frame(d = result[, c(3, 4, 6)],
                                 vertices = node,
                                 directed = FALSE)

plot(network,
     edge.arrow.size = .2, edge.curved = 0,
     vertex.size = network %>% degree(mode = "all"),
     vertex.color = "orange", vertex.frame.color = "firebrick",
     vertex.shape = "circle",
     vertex.label = node, vertex.label.color = "black",
     vertex.label.cex = .7)
```

Currently, all the nodes in the plot belong to the selected pathway. Is it possible to incorporate genes that directly interact with the input genes?

As STRING uses taxonomy IDs, we need a function to get the ID for users.

Here is a prototype:

```r
getTaxID <- function(species) {
    if (inherits(species, "OrgDb")) {
        ## metadata() is provided by AnnotationDbi, which is attached with any OrgDb package
        m <- metadata(species)
        res <- m$value[m$name == "TAXID"]
        return(res)
    }

    ## query from a lookup table that stores kegg.code to taxa.id
}

getTaxID(org.Hs.eg.db)
# [1] "9606"
```

A lookup table looks something like this: https://github.com/datapplab/SBGNhub/blob/9f94cde57c721e5392dfa1e574ca445d4d06d23b/data/species.specifid.pathway_pathwayCommons/trees/kegg.code.to.taxa.tsv.

However, that table is outdated. We need to prepare an up-to-date version.

Luckily, I just found that the Ensembl Project provides an API for retrieving taxonomy IDs (https://rest.ensembl.org/documentation/info/taxonomy_id). It might also be a good way to get them.

```r
library(httr)
library(jsonlite)
library(magrittr)

getTax_id <- function(species) {
  # query the Ensembl REST taxonomy endpoint and parse the JSON response
  res_url <- paste("https://rest.ensembl.org/taxonomy/id",
                   (strsplit(species, " ") %>% unlist() %>% paste(collapse = "%20") %>%
                      paste("?application/json", sep = "")),
                   sep = "/") %>% GET()
  res <- res_url$content %>% rawToChar() %>% fromJSON()
  res$id
}

getTax_id("Mus Musculus")
# [1] "10090"
getTax_id("Homo Sapiens")
# [1] "9606"
getTax_id("Acetitomaculum ruminis DSM 5522")
# [1] "1120918"
```

A new function has been built to better cooperate with the result returned by enrichKEGG() or enrichGO(). Now users can easily get the input gene list by using cat_id().

```r
cat_id <- function(enrich_res, show_Category) {
  input_res <- as.data.frame(enrich_res@result)
  input_res[input_res$Description %in% show_Category, ]$geneID %>%
    strsplit("/") %>%
    unlist() %>%
    paste(collapse = "%0d")
}

cat_id(kegg_test, show_Category = c("Apoptosis", "Cell cycle"))
```
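
The concatenated ID string can then be passed straight to the DEGs_inter() function defined earlier (10090 for mouse):

```r
# Query STRING for interactions among the genes of the selected pathways
result <- DEGs_inter(genelist = cat_id(kegg_test, show_Category = c("Apoptosis", "Cell cycle")),
                     species = 10090)
```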

> Currently, all the nodes in the plot belong to the selected pathway. Is it possible to incorporate genes that directly interact with the input genes?

Now the partners that potentially interact with the input genes can easily be obtained by using getPartner_net().

```r
getPartner_net <- function(genelist, species, limit) {
  res <- paste("https://string-db.org/api/json",
               paste("interaction_partners?identifiers=", genelist,
                     paste("&species=", species, sep = ""),
                     paste("&limit=", limit, sep = ""),  # the number of wanted partner genes
                     sep = ""),
               sep = "/") %>% GET()
  res$content %>% rawToChar() %>% fromJSON()
}
```
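
For example, reusing `pathway_gene_id` from the enrichment step above (limit = 5 is an arbitrary choice, and the preferredName_A/B and score columns are assumed to match the network endpoint):

```r
# Fetch up to 5 STRING interaction partners per input gene
partner_result <- getPartner_net(genelist = pathway_gene_id, species = 10090, limit = 5)
head(partner_result[, c("preferredName_A", "preferredName_B", "score")])
```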

Let's finish this part first.

```r
library(jsonlite)

getTaxID <- function(species) {
    if (inherits(species, "OrgDb")) {
        m <- metadata(species)
        res <- m$value[m$name == "TAXID"]
        return(res)
    }

    ## load cached taxonomy ID information
    ## kegg.code taxa.id
    ## e.g. hsa 9606
    ##
    ## use getTaxInfo() below to prepare the cached data
    ## species list from https://rest.kegg.jp/list/organism
    ## extract species from the third column
    ## use second column as kegg.code

    i <- which(kegg_taxa$kegg.code == species)
    if (length(i) != 0) {
        return(kegg_taxa$taxa.id[i])
    }

    res <- getTaxInfo(species)
    return(res$id)
}

getTaxInfo <- function(species) {
    url <- paste0(
        "https://rest.ensembl.org/taxonomy/id/",
        gsub(" ", "%20", species),  # encode every space, not just the first one
        "?application/json"
    )

    fromJSON(url)
}
```
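
To prepare the cached kegg.code-to-taxa.id table described in the comments above, something along these lines could work. This is a rough sketch only: the helper name prepare_kegg_taxa and the column handling of the KEGG organism list are my assumptions, and looping over every organism will be slow and possibly rate-limited by Ensembl.

```r
# Sketch: build the kegg.code -> taxa.id lookup used by getTaxID()
prepare_kegg_taxa <- function() {
  # KEGG organism list has no header: T number, kegg code, species name, lineage
  org <- read.delim("https://rest.kegg.jp/list/organism", header = FALSE,
                    stringsAsFactors = FALSE)
  kegg.code <- org[[2]]
  # drop the trailing common-name annotation, e.g. "Homo sapiens (human)"
  species <- sub(" \\(.*\\)$", "", org[[3]])

  taxa.id <- vapply(species, function(x) {
    id <- tryCatch(getTaxInfo(x)$id, error = function(e) NULL)
    if (length(id) == 0) NA_character_ else as.character(id[1])
  }, character(1), USE.NAMES = FALSE)

  data.frame(kegg.code = kegg.code, taxa.id = taxa.id, stringsAsFactors = FALSE)
}

## kegg_taxa <- prepare_kegg_taxa()   # then cache it, e.g. saveRDS(kegg_taxa, "kegg_taxa.rds")
```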

Em... after trying the code, I have to admit that getting things done as locally as we can is a more elegant way. Let me keep improving it.

  1. Here is example code for accessing data from an API URL;
  2. If you would like to format your messages in Markdown, you can use triple backticks (```) before and after the start and end lines of your code. @Potato-tudou
```r
library(httr)

# Data fetch --------------------------------------------------------------
# Set stringDB base URL
address <- "https://string-db.org"

# Validate the address
httr::stop_for_status(httr::GET(address))

# Version info
current_version <- read.table(url(paste(address, "/api/tsv/version", sep = "")), header = TRUE)
available_version <- read.table(url(paste(address, "/api/tsv/available_api_versions", sep = "")), header = TRUE)

#' networkParamsParser
#'
#' Parameter parser for [Getting the STRING network interactions](https://string-db.org/cgi/help.pl?sessionId=btsvnCeNrBk7).
#'
#' @param identifiers required parameter for multiple items, e.g. `c("PTCH1", "TP53", "BRCA1", "BRCA2")`
#' @param species NCBI taxon identifier (e.g. human is 9606, see: [STRING organisms](https://string-db.org/cgi/input.pl?input_page_active_form=organisms))
#' @param required_score threshold of significance to include an interaction, a number between 0 and 1000 (default depends on the network)
#' @param network_type network type: functional (default), physical
#' @param add_nodes adds a number of proteins to the network based on their confidence score (default: 1)
#' @param show_query_node_labels when available, use submitted names in the preferredName column (0 or 1) (default: 0)
#' @param caller_identity your identifier for STRING
#'
#' @return a list containing parameters for the query
networkParamsParser <- function(
    identifiers,
    species,
    required_score = NULL,
    network_type = "functional",
    add_nodes = 1,
    show_query_node_labels = 0,
    caller_identity = NULL
) {
  # Format the identifiers
  identifiers <- paste(identifiers, collapse = "\n")

  # Check parameters
  if (missing(species)) {
    stop("Please provide an NCBI taxon identifier for the species.")
  }

  # Create parameters list
  params <- list(
    identifiers = identifiers,
    species = species,
    required_score = required_score,
    network_type = network_type,
    add_nodes = add_nodes,
    show_query_node_labels = show_query_node_labels,
    caller_identity = caller_identity
  )

  # Remove NULL elements from the list
  filtered_params <- Filter(Negate(is.null), params)
  return(filtered_params)
}

networkParams <- networkParamsParser(
  identifiers = c("PTCH1", "TP53", "BRCA1", "BRCA2"),
  species = 9606,
  network_type = "functional",
  add_nodes = 1,
  show_query_node_labels = 0
)

# Define a cache directory
cache_dir <- tempdir()

# Define a function to read a file with caching
read_tsv_with_cache <- memoise::memoise(function(file_url, header = FALSE) {
  # Generate a unique cache filename based on the file URL
  cache_filename <- fs::path_join(c(cache_dir, paste0(digest::digest(file_url), ".rds")))

  # Check if the cached file exists
  if (file.exists(cache_filename)) {
    # If the cached file exists, load and return the cached data
    cached_data <- readRDS(cache_filename)
    return(cached_data)
  } else {
    # If the cached file does not exist, download and cache the data
    data <- read.delim(url(file_url), sep = "\t", header = header)
    saveRDS(data, cache_filename)
    return(data)
  }
})

# Read data from the stringDB API
response <- httr::GET(paste(address, "/api/tsv/network", sep = ""), query = networkParams)
res <- read_tsv_with_cache(response$url, header = TRUE)
```
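
From here, `res` can be turned back into an igraph network with the same recipe used earlier in the thread; a minimal sketch, assuming the tsv network endpoint returns the same preferredName_A, preferredName_B and score columns as the json endpoint did above:

```r
library(igraph)

# Build an undirected PPI network from the fetched interactions
node <- unique(c(res$preferredName_A, res$preferredName_B))
network <- graph_from_data_frame(d = res[, c("preferredName_A", "preferredName_B", "score")],
                                 vertices = node,
                                 directed = FALSE)
plot(network,
     vertex.size = degree(network, mode = "all"),
     vertex.color = "orange", vertex.frame.color = "firebrick",
     vertex.label = node, vertex.label.color = "black",
     vertex.label.cex = .7)
```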

```r
# Read data from the species code lists
# Define the URLs for the files
kegg_species_url <- "https://rest.kegg.jp/list/organism"
stringdb_species_url <- "https://stringdb-static.org/download/species.v11.5.txt"

# Read the files with caching
kegg_species <- read_tsv_with_cache(kegg_species_url)
stringdb_species <- read_tsv_with_cache(stringdb_species_url, header = TRUE)
```
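
As a rough sketch of how these two lists could be combined into the kegg.code-to-taxonomy-ID lookup discussed above: the column positions below are my assumptions about the current file layouts, so verify them before relying on this.

```r
# KEGG organism list (no header): T number, kegg code, species name, lineage
kegg_map <- data.frame(
  kegg.code = kegg_species[[2]],
  # drop the trailing common-name annotation, e.g. "Homo sapiens (human)"
  species   = sub(" \\(.*\\)$", "", kegg_species[[3]]),
  stringsAsFactors = FALSE
)

# STRING species table: taxon id in the first column, official NCBI name in the last
string_map <- data.frame(
  taxa.id = stringdb_species[[1]],
  species = stringdb_species[[ncol(stringdb_species)]],
  stringsAsFactors = FALSE
)

# Join on the species name to get kegg.code -> taxa.id
kegg_taxa <- merge(kegg_map, string_map, by = "species")
head(kegg_taxa)
```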

COOL. As a beginner, I need more than one night to read and digest your code. Thanks.

Thanks, @Potato-tudou and @xiayh17. We now have the first version of the getTaxID and getPPI functions in clusterProfiler; see YuLab-SMU/clusterProfiler@d27ae96.

For details, see the WeChat official account post "当clusterProfiler遇见stringdb..." (When clusterProfiler meets stringdb).