ramiromagno/gwasrapidd

NCBI build conversion

jinjinzhou opened this issue · 8 comments

No information about the build of GWAS catalog which is in build hg38.
Does your package support the liftover conversation to, for example, hg19?

Hi Jin,

Thank you for your question.

So, no, gwasrapidd won't do the liftover for you. But it is not too hard to do with Bioconductor's liftOver.

If you would like help with lifting over some gwasrapidd results, perhaps you can provide a code example of what you trying to achieve and I will be happy to help.

See if this helps:

if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
BiocManager::install('liftOver')
BiocManager::install('rtracklayer')

library(tidyverse)
library(gwasrapidd)
library(liftOver)

variants <- gwasrapidd::get_variants(efo_trait = 'high density lipoprotein cholesterol measurement')

# For some reason, sometimes, GWAS Catalog does not have the genomic coordinates of some
# of the variants; these cannot be obviously lifted over, so are here eliminated.
variants_to_be_kept <- which(!is.na(variants@variants$chromosome_name) & !is.na(variants@variants$chromosome_position))

# Using gwasrapidd subsetting by position to keep only those variants for which there is 
# genomic coordinates.
variants2 <- variants[variants_to_be_kept]

# To check how many variants we lost:
gwasrapidd::n(variants)
gwasrapidd::n(variants2)

# Make a GRanges object from the genomic positions (hg38 assembly)
gr_hg38 <- GRanges(
  seqnames = as(variants2@variants$chromosome_name, "Rle"),
  ranges = IRanges(start = variants2@variants$chromosome_position, end = variants2@variants$chromosome_position, names = variants2@variants$variant_id),
  strand = "*"
)

# Loading the file with the information on how to liftover from hg38 to hg19
chain_file <- system.file(package="liftOver", "extdata", "hg38ToHg19.over.chain")
ch <- rtracklayer::import.chain(chain_file)

## Performing the actual liftover from hg38 to hg19
GenomeInfoDb::seqlevelsStyle(gr_hg38) <- "UCSC"  # necessary
gr_hg19 <- unlist(rtracklayer::liftOver(gr_hg38, ch))

diff <- setdiff(names(gr_hg38), names(gr_hg19))
message('There are ', length(diff), ' variants lost in the liftover from hg38 to hg19: ', paste(diff, collapse=", "), '.')

tbl_hg19 <- as.data.frame(gr_hg19) %>%
  dplyr::as_tibble(gr_hg19, rownames = 'variant_id') %>%
  dplyr::mutate(chromosome_name = stringr::str_remove(seqnames, '^chr'),
                chromosome_position = start) %>%
  dplyr::select(variant_id, chromosome_name, chromosome_position)

The same but with some output interwoven:

# if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
# BiocManager::install('liftOver')
# BiocManager::install('rtracklayer')

library(tidyverse)
library(gwasrapidd)
library(liftOver)

variants <- gwasrapidd::get_variants(efo_trait = 'high density lipoprotein cholesterol measurement')

variants@variants
#> # A tibble: 2,455 x 7
#>    variant_id  merged functional_class   chromosome_name chromosome_position
#>    <chr>        <int> <chr>              <chr>                         <int>
#>  1 rs151105710      1 intron_variant     3                         136406837
#>  2 rs68148663       1 intron_variant     1                          62687529
#>  3 rs201510740      1 intron_variant     1                          40221322
#>  4 rs28933094       1 missense_variant   15                         58563549
#>  5 rs182095422      1 intergenic_variant 6                          32701596
#>  6 rs75185853       1 intron_variant     11                         46729851
#>  7 rs5844832        1 intron_variant     22                         29059700
#>  8 rs77468106       1 intergenic_variant 20                         45646980
#>  9 rs149580368      1 intergenic_variant 17                         43797377
#> 10 rs553682607      1 intron_variant     10                         45564986
#> # … with 2,445 more rows, and 2 more variables: chromosome_region <chr>,
#> #   last_update_date <dttm>

# For some reason, sometimes, GWAS Catalog does not have the genomic coordinates of some
# of the variants; these cannot be obviously lifted over, so are here eliminated.
variants_to_be_kept <- which(!is.na(variants@variants$chromosome_name) & !is.na(variants@variants$chromosome_position))

# Using gwasrapidd subsetting by position to keep only those variants for which there is 
# genomic coordinates.
variants2 <- variants[variants_to_be_kept]
variants2@variants
#> # A tibble: 2,240 x 7
#>    variant_id  merged functional_class   chromosome_name chromosome_position
#>    <chr>        <int> <chr>              <chr>                         <int>
#>  1 rs151105710      1 intron_variant     3                         136406837
#>  2 rs68148663       1 intron_variant     1                          62687529
#>  3 rs201510740      1 intron_variant     1                          40221322
#>  4 rs28933094       1 missense_variant   15                         58563549
#>  5 rs182095422      1 intergenic_variant 6                          32701596
#>  6 rs75185853       1 intron_variant     11                         46729851
#>  7 rs5844832        1 intron_variant     22                         29059700
#>  8 rs77468106       1 intergenic_variant 20                         45646980
#>  9 rs149580368      1 intergenic_variant 17                         43797377
#> 10 rs553682607      1 intron_variant     10                         45564986
#> # … with 2,230 more rows, and 2 more variables: chromosome_region <chr>,
#> #   last_update_date <dttm>

# To check how many variants we lost:
gwasrapidd::n(variants)
#> [1] 2455
gwasrapidd::n(variants2)
#> [1] 2240

# These are the variants lost because of no info on genomic coordinates.
# These are mostly variants without SNP annotation (so either very rare or
# artifacts in the original works).
gwasrapidd::setdiff(variants, variants2)@variants
#> # A tibble: 215 x 7
#>    variant_id      merged functional_class chromosome_name chromosome_position
#>    <chr>            <int> <chr>            <chr>                         <int>
#>  1 Chr19:45422946       0 <NA>             <NA>                             NA
#>  2 chr1:93837780:D      0 <NA>             <NA>                             NA
#>  3 chr2:73538394        0 <NA>             <NA>                             NA
#>  4 chr17:15870401       0 <NA>             <NA>                             NA
#>  5 exm1417699           0 <NA>             <NA>                             NA
#>  6 chr15:77799744       0 <NA>             <NA>                             NA
#>  7 Chr8:126495818       0 <NA>             <NA>                             NA
#>  8 chr18:47179516       0 <NA>             <NA>                             NA
#>  9 chr8:20853599        0 <NA>             <NA>                             NA
#> 10 chr1:230297136       0 <NA>             <NA>                             NA
#> # … with 205 more rows, and 2 more variables: chromosome_region <chr>,
#> #   last_update_date <dttm>

# Make a GRanges object from the genomic positions (hg38 assembly)
gr_hg38 <- GRanges(
  seqnames = as(variants2@variants$chromosome_name, "Rle"),
  ranges = IRanges(start = variants2@variants$chromosome_position, end = variants2@variants$chromosome_position, names = variants2@variants$variant_id),
  strand = "*"
)

# Loading the file with the information on how to liftover from hg38 to hg19
chain_file <- system.file(package="liftOver", "extdata", "hg38ToHg19.over.chain")
ch <- rtracklayer::import.chain(chain_file)

## Performing the actual liftover from hg38 to hg19
GenomeInfoDb::seqlevelsStyle(gr_hg38) <- "UCSC"  # necessary
gr_hg19 <- unlist(rtracklayer::liftOver(gr_hg38, ch))

diff <- setdiff(names(gr_hg38), names(gr_hg19))
message('There are ', length(diff), ' variants lost in the liftover from hg38 to hg19: ', paste(diff, collapse=", "), '.')
#> There are 3 variants lost in the liftover from hg38 to hg19: rs453755, rs1839069, rs386000.

tbl_hg19 <- as.data.frame(gr_hg19) %>%
  dplyr::as_tibble(gr_hg19, rownames = 'variant_id') %>%
  dplyr::mutate(chromosome_name = stringr::str_remove(seqnames, '^chr'),
                chromosome_position = start) %>%
  dplyr::select(variant_id, chromosome_name, chromosome_position)

tbl_hg19
#> # A tibble: 2,237 x 3
#>    variant_id  chromosome_name chromosome_position
#>    <chr>       <chr>                         <int>
#>  1 rs151105710 3                         136125679
#>  2 rs68148663  1                          63153200
#>  3 rs201510740 1                          40686994
#>  4 rs28933094  15                         58855748
#>  5 rs182095422 6                          32669373
#>  6 rs75185853  11                         46751401
#>  7 rs5844832   22                         29455688
#>  8 rs77468106  20                         44275619
#>  9 rs149580368 17                         41874745
#> 10 rs553682607 10                         46060434
#> # … with 2,227 more rows

Hi Jin:

May I close this issue now?

Yes, thank you for providing such a useful and easy to work with package!

You're welcome! Let me know once your analyses are out and published.