YuLab-SMU/ProjectYulab

construct PPI network from ORA result

GuangchuangYu opened this issue · 18 comments

# An early, unpolished, alpha version of the script
```r
library(tidyverse)
library(httr)
library(jsonlite)
library(rlist)
library(magrittr)
library(igraph)
library(clipr)

genelist <- DEG_names

write_clip(genelist) # to make a copy and verify on the STRING database

out_form <- "json" # output formats include: tsv, tsv-no-header, json, xml, psi-mi, psi-mi-tab
input_genes <- paste(genelist, collapse = "%0d")
url_genes <- paste("https://string-db.org/api",
                   out_form,
                   paste("network?identifiers=", input_genes, "&species=10090", sep = ""),
                   sep = "/")
response <- GET(url_genes)
result <- fromJSON(content(response, as = "text"))

write.csv(result, "~/Desktop/test_PPI.csv") # output the result if needed

node <- unique(c(result$preferredName_A, # make sure all the genes are included
                 result$preferredName_B))
network <- graph_from_data_frame(d = result[, c(3, 4, 6)],
                                 vertices = node,
                                 directed = FALSE)
deg <- degree(network, mode = "all")
plot(network,
     edge.arrow.size = .2, edge.curved = 0,
     vertex.size = deg,
     vertex.color = "orange", vertex.frame.color = "firebrick",
     vertex.shape = "circle",
     vertex.label = node, vertex.label.color = "black",
     vertex.label.cex = .7)
```

```r
library(pacman)
pacman::p_load(tidyverse, httr, jsonlite, rlist, magrittr, igraph, clipr)

DEGs_inter <- function(genelist, species) {
  fromJSON(content(paste("https://string-db.org/api",
                         "json",
                         paste("network?identifiers=",
                               genelist %>% paste(collapse = "%0d"),
                               paste("&species=", species, sep = ""),
                               sep = ""),
                         sep = "/") %>% GET(), as = "text"))
}

result <- DEGs_inter(genelist = DEG_names, species = 10090) # Mouse, 10090; Human, 9606; Rat, 10116; etc.
write.csv(result, "~/Desktop/test_PPI.csv") # output the result if needed

network <- graph_from_data_frame(d = result[, c(3, 4, 6)],
                                 vertices = unique(c(result$preferredName_A,
                                                     result$preferredName_B)),
                                 directed = FALSE)
plot(network,
     edge.arrow.size = .2, edge.curved = 0,
     vertex.size = degree(network, mode = "all"),
     vertex.color = "orange", vertex.frame.color = "firebrick",
     vertex.shape = "circle",
     vertex.label = V(network)$name, vertex.label.color = "black", # node names come from the graph itself
     vertex.label.cex = .7)
```

A reproducible error has been found when combining the use of "org.Mm.eg.db" with the customized function `DEGs_inter`.

```r
DEGs_inter <- function(genelist, species) {
  fromJSON(content(paste("https://string-db.org/api",
                         "json",
                         paste("network?identifiers=",
                               genelist %>% paste(collapse = "%0d"),
                               paste("&species=", species, sep = ""),
                               sep = ""),
                         sep = "/") %>% GET(), as = "text"))
}
```

Once "org.Mm.eg.db" is loaded, the content() function always reports an error, regardless of whether any function from "org.Mm.eg.db" is called. Can't figure out the reason for now. 😢

Problem solved by avoiding the use of content(). Now the script is set up to work with the result of enrichKEGG() or enrichGO().

```r
library(pacman)
pacman::p_load(tidyverse, httr, jsonlite, rlist, magrittr, igraph, clipr, clusterProfiler, org.Mm.eg.db)

DEGs_inter <- function(genelist, species) {
  res <- paste("https://string-db.org/api/json",
               paste("network?identifiers=", genelist,
                     paste("&species=", species, sep = ""),
                     sep = ""),
               sep = "/") %>% GET()
  res$content %>% rawToChar() %>% fromJSON()
}

### load your own data
DEG_test <- na.omit(read.csv("~/wonderland/test.csv", header = TRUE, sep = ","))
DEG_test_id <- bitr(DEG_test$geneID, fromType = "ENSEMBL", toType = c("ENTREZID", "SYMBOL"),
                    OrgDb = "org.Mm.eg.db")

### enrichment analysis
kegg_test <- enrichKEGG(gene = DEG_test_id$ENTREZID,
                        organism = "mmu",
                        pAdjustMethod = "BH",
                        pvalueCutoff = 0.01,
                        qvalueCutoff = 0.05) %>%
  setReadable(OrgDb = org.Mm.eg.db, keyType = "ENTREZID")
kegg_sig <- kegg_test@result[kegg_test@result$p.adjust < 0.05, ]

pathway_gene_id <- kegg_sig[kegg_sig$Description == "Apoptosis", ]$geneID %>% # or your own pathway of interest
  strsplit("/") %>% unlist() %>% paste(collapse = "%0d")
result <- DEGs_inter(genelist = pathway_gene_id, species = 10090)
# write.csv(result, "~/Desktop/test_PPI.csv") # output the result if needed

node <- unique(c(result$preferredName_A, result$preferredName_B))
network <- graph_from_data_frame(d = result[, c(3, 4, 6)],
                                 vertices = node,
                                 directed = FALSE)

plot(network,
     edge.arrow.size = .2, edge.curved = 0,
     vertex.size = network %>% degree(mode = "all"),
     vertex.color = "orange", vertex.frame.color = "firebrick",
     vertex.shape = "circle",
     vertex.label = node, vertex.label.color = "black",
     vertex.label.cex = .7)
```

Currently, all the nodes in the plot belong to the selected pathway. Is it possible to incorporate genes that directly interact with the input genes?

As STRING uses taxonomy IDs, we need a function to get the ID for users.

Here is a prototype:

```r
getTaxID <- function(species) {
    if (inherits(species, "OrgDb")) {
        ## metadata() is provided by AnnotationDbi, which is attached with any OrgDb package
        m <- metadata(species)
        res <- m$value[m$name == "TAXID"]
        return(res)
    }

    ## query from a lookup table that stores kegg.code to taxa.id
}

getTaxID(org.Hs.eg.db)
# [1] "9606"
```

A lookup table looks something like this: https://github.com/datapplab/SBGNhub/blob/9f94cde57c721e5392dfa1e574ca445d4d06d23b/data/species.specifid.pathway_pathwayCommons/trees/kegg.code.to.taxa.tsv.

However, that table is outdated. We need to prepare an up-to-date version.

Luckily, I just found that the Ensembl Project provides an API for retrieving taxonomy IDs (https://rest.ensembl.org/documentation/info/taxonomy_id). It might also be a good way to get them.

```r
library(httr)
library(jsonlite)
library(magrittr)

getTax_id <- function(species) {
  # query the Ensembl REST taxonomy endpoint and parse the JSON response
  res_url <- paste("https://rest.ensembl.org/taxonomy/id",
                   (strsplit(species, " ") %>% unlist() %>% paste(collapse = "%20") %>%
                      paste("?application/json", sep = "")),
                   sep = "/") %>% GET()
  res <- res_url$content %>% rawToChar() %>% fromJSON()
  res$id
}

getTax_id("Mus Musculus")
# [1] "10090"
getTax_id("Homo Sapiens")
# [1] "9606"
getTax_id("Acetitomaculum ruminis DSM 5522")
# [1] "1120918"
```

A new function has been built to better cooperate with the result returned by enrichKEGG() or enrichGO(). Now users can easily get the input gene list by using cat_id().

```r
cat_id <- function(enrich_res, show_Category) {
  input_res <- as.data.frame(enrich_res@result)
  input_res[input_res$Description %in% show_Category, ]$geneID %>%
    strsplit("/") %>%
    unlist() %>%
    paste(collapse = "%0d")
}

cat_id(kegg_test, show_Category = c("Apoptosis", "Cell cycle"))
```
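
The concatenated ID string can then be passed straight to the DEGs_inter() function defined earlier (10090 for mouse):

```r
# Query STRING for interactions among the genes of the selected pathways
result <- DEGs_inter(genelist = cat_id(kegg_test, show_Category = c("Apoptosis", "Cell cycle")),
                     species = 10090)
```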

> Currently, all the nodes in the plot belong to the selected pathway. Is it possible to incorporate genes that directly interact with the input genes?

Now the partners that potentially interact with the input genes can easily be obtained by using getPartner_net().

```r
getPartner_net <- function(genelist, species, limit) {
  res <- paste("https://string-db.org/api/json",
               paste("interaction_partners?identifiers=", genelist,
                     paste("&species=", species, sep = ""),
                     paste("&limit=", limit, sep = ""),  # the number of wanted partner genes
                     sep = ""),
               sep = "/") %>% GET()
  res$content %>% rawToChar() %>% fromJSON()
}
```
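
For example, reusing `pathway_gene_id` from the enrichment step above (limit = 5 is an arbitrary choice, and the preferredName_A/B and score columns are assumed to match the network endpoint):

```r
# Fetch up to 5 STRING interaction partners per input gene
partner_result <- getPartner_net(genelist = pathway_gene_id, species = 10090, limit = 5)
head(partner_result[, c("preferredName_A", "preferredName_B", "score")])
```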

Let's finish this part first.

```r
library(jsonlite)

getTaxID <- function(species) {
    if (inherits(species, "OrgDb")) {
        m <- metadata(species)
        res <- m$value[m$name == "TAXID"]
        return(res)
    }

    ## load cached taxonomy ID information
    ## kegg.code taxa.id
    ## e.g. hsa 9606
    ##
    ## use getTaxInfo() below to prepare the cached data
    ## species list from https://rest.kegg.jp/list/organism
    ## extract species from the third column
    ## use second column as kegg.code

    i <- which(kegg_taxa$kegg.code == species)
    if (length(i) != 0) {
        return(kegg_taxa$taxa.id[i])
    }

    res <- getTaxInfo(species)
    return(res$id)
}

getTaxInfo <- function(species) {
    url <- paste0(
        "https://rest.ensembl.org/taxonomy/id/",
        gsub(" ", "%20", species),  # encode every space, not just the first one
        "?application/json"
    )

    fromJSON(url)
}
```
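
To prepare the cached kegg.code-to-taxa.id table described in the comments above, something along these lines could work. This is a rough sketch only: the helper name prepare_kegg_taxa and the column handling of the KEGG organism list are my assumptions, and looping over every organism will be slow and possibly rate-limited by Ensembl.

```r
# Sketch: build the kegg.code -> taxa.id lookup used by getTaxID()
prepare_kegg_taxa <- function() {
  # KEGG organism list has no header: T number, kegg code, species name, lineage
  org <- read.delim("https://rest.kegg.jp/list/organism", header = FALSE,
                    stringsAsFactors = FALSE)
  kegg.code <- org[[2]]
  # drop the trailing common-name annotation, e.g. "Homo sapiens (human)"
  species <- sub(" \\(.*\\)$", "", org[[3]])

  taxa.id <- vapply(species, function(x) {
    id <- tryCatch(getTaxInfo(x)$id, error = function(e) NULL)
    if (length(id) == 0) NA_character_ else as.character(id[1])
  }, character(1), USE.NAMES = FALSE)

  data.frame(kegg.code = kegg.code, taxa.id = taxa.id, stringsAsFactors = FALSE)
}

## kegg_taxa <- prepare_kegg_taxa()   # then cache it, e.g. saveRDS(kegg_taxa, "kegg_taxa.rds")
```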

Em... after trying the code, I have to admit that getting things done as locally as we can is a more elegant way. Let me keep improving it.

  1. Here is example code for accessing data from an API URL;
  2. If you would like to format your messages in Markdown, you can use triple backticks (```) before and after the start and end lines of your code. @Potato-tudou
```r
library(httr)

# Data fetch --------------------------------------------------------------
# Set stringDB base URL
address <- "https://string-db.org"

# Validate the address
httr::stop_for_status(httr::GET(address))

# Version info
current_version <- read.table(url(paste(address, "/api/tsv/version", sep = "")), header = TRUE)
available_version <- read.table(url(paste(address, "/api/tsv/available_api_versions", sep = "")), header = TRUE)

#' networkParamsParser
#'
#' Parameter parser for [Getting the STRING network interactions](https://string-db.org/cgi/help.pl?sessionId=btsvnCeNrBk7).
#'
#' @param identifiers required parameter for multiple items, e.g. `c("PTCH1", "TP53", "BRCA1", "BRCA2")`
#' @param species NCBI taxon identifier (e.g. human is 9606, see: [STRING organisms](https://string-db.org/cgi/input.pl?input_page_active_form=organisms))
#' @param required_score threshold of significance to include an interaction, a number between 0 and 1000 (default depends on the network)
#' @param network_type network type: functional (default), physical
#' @param add_nodes adds a number of proteins to the network based on their confidence score (default: 1)
#' @param show_query_node_labels when available, use submitted names in the preferredName column (0 or 1) (default: 0)
#' @param caller_identity your identifier for STRING
#'
#' @return a list containing parameters for the query
networkParamsParser <- function(
    identifiers,
    species,
    required_score = NULL,
    network_type = "functional",
    add_nodes = 1,
    show_query_node_labels = 0,
    caller_identity = NULL
) {
  # Format the identifiers
  identifiers <- paste(identifiers, collapse = "\n")

  # Check parameters
  if (missing(species)) {
    stop("Please provide an NCBI taxon identifier for the species.")
  }

  # Create parameters list
  params <- list(
    identifiers = identifiers,
    species = species,
    required_score = required_score,
    network_type = network_type,
    add_nodes = add_nodes,
    show_query_node_labels = show_query_node_labels,
    caller_identity = caller_identity
  )

  # Remove NULL elements from the list
  filtered_params <- Filter(Negate(is.null), params)
  return(filtered_params)
}

networkParams <- networkParamsParser(
  identifiers = c("PTCH1", "TP53", "BRCA1", "BRCA2"),
  species = 9606,
  network_type = "functional",
  add_nodes = 1,
  show_query_node_labels = 0
)

# Define a cache directory
cache_dir <- tempdir()

# Define a function to read a file with caching
read_tsv_with_cache <- memoise::memoise(function(file_url, header = FALSE) {
  # Generate a unique cache filename based on the file URL
  cache_filename <- fs::path_join(c(cache_dir, paste0(digest::digest(file_url), ".rds")))

  # Check if the cached file exists
  if (file.exists(cache_filename)) {
    # If the cached file exists, load and return the cached data
    cached_data <- readRDS(cache_filename)
    return(cached_data)
  } else {
    # If the cached file does not exist, download and cache the data
    data <- read.delim(url(file_url), sep = "\t", header = header)
    saveRDS(data, cache_filename)
    return(data)
  }
})

# Read data from the stringDB API
response <- httr::GET(paste(address, "/api/tsv/network", sep = ""), query = networkParams)
res <- read_tsv_with_cache(response$url, header = TRUE)
```
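
From here, `res` can be turned back into an igraph network with the same recipe used earlier in the thread; a minimal sketch, assuming the tsv network endpoint returns the same preferredName_A, preferredName_B and score columns as the json endpoint did above:

```r
library(igraph)

# Build an undirected PPI network from the fetched interactions
node <- unique(c(res$preferredName_A, res$preferredName_B))
network <- graph_from_data_frame(d = res[, c("preferredName_A", "preferredName_B", "score")],
                                 vertices = node,
                                 directed = FALSE)
plot(network,
     vertex.size = degree(network, mode = "all"),
     vertex.color = "orange", vertex.frame.color = "firebrick",
     vertex.label = node, vertex.label.color = "black",
     vertex.label.cex = .7)
```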

```r
# Read data from the species code lists
# Define the URLs for the files
kegg_species_url <- "https://rest.kegg.jp/list/organism"
stringdb_species_url <- "https://stringdb-static.org/download/species.v11.5.txt"

# Read the files with caching
kegg_species <- read_tsv_with_cache(kegg_species_url)
stringdb_species <- read_tsv_with_cache(stringdb_species_url, header = TRUE)
```
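
As a rough sketch of how these two lists could be combined into the kegg.code-to-taxonomy-ID lookup discussed above: the column positions below are my assumptions about the current file layouts, so verify them before relying on this.

```r
# KEGG organism list (no header): T number, kegg code, species name, lineage
kegg_map <- data.frame(
  kegg.code = kegg_species[[2]],
  # drop the trailing common-name annotation, e.g. "Homo sapiens (human)"
  species   = sub(" \\(.*\\)$", "", kegg_species[[3]]),
  stringsAsFactors = FALSE
)

# STRING species table: taxon id in the first column, official NCBI name in the last
string_map <- data.frame(
  taxa.id = stringdb_species[[1]],
  species = stringdb_species[[ncol(stringdb_species)]],
  stringsAsFactors = FALSE
)

# Join on the species name to get kegg.code -> taxa.id
kegg_taxa <- merge(kegg_map, string_map, by = "species")
head(kegg_taxa)
```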

COOL. As a beginner, I need more than one night to read and digest your code. Thanks.

Thanks, @Potato-tudou and @xiayh17. We now have the first version of the getTaxID and getPPI functions in clusterProfiler; see YuLab-SMU/clusterProfiler@d27ae96.

For details, see the WeChat official account post "当clusterProfiler遇见stringdb..." (When clusterProfiler meets stringdb).