ohsu-comp-bio/g2p-aggregator

PMKB harvester incorrect alternate allele representation

Opened this issue · 7 comments

Harvested features set coordinates to genomic but ref and alt to transcript (coding DNA). This has the effect of creating incorrect ref and alt for negative strand genes.

Bug is here:

https://github.com/ohsu-comp-bio/g2p-aggregator/blob/v0.10/harvester/pmkb.py#L57-L64

Thanks @ahwagner
A detailed example below. What should the expected results for ref and alt be? (sorry if this an obvious question)

example pmkb input

{"tumor": [{"id": 96, "name": "Papillary Carcinoma"}], "tissues": [{"id": 54, "name": "Thyroid"}], "variant": [{"amino_acid_change": "V600E", "germline": null, "partner_gene": null, "description": null, "exons": "15", "cnv_type": null, "notes": null, "cosmic": "476", "effect": null, "coordinates": "7:140453136-140453136", "description_type": "HGVS", "cytoband": null, "variant_type": "missense", "dna_change": "1799T>A", "codons": "600", "chromosome_based_cnv": false, "gene": {"description": null, "created_at": "2015-12-09T20:08:27.000Z", "updated_at": "2017-08-24T22:00:40.000Z", "active_ind": true, "external_id": "157764", "id": 33, "name": "BRAF"}, "transcript": "ENST00000288602", "id": 112, "chromosome": null, "name": "BRAF V600E"}, {"amino_acid_change": null, "germline": null, "partner_gene": null, "description": null, "exons": "15", "cnv_type": null, "notes": null, "cosmic": null, "effect": null, "coordinates": "7:140453135-140453137", "description_type": "codon", "cytoband": null, "variant_type": "any", "dna_change": null, "codons": "600", "chromosome_based_cnv": false, "gene": {"description": null, "created_at": "2015-12-09T20:08:27.000Z", "updated_at": "2017-08-24T22:00:40.000Z", "active_ind": true, "external_id": "157764", "id": 33, "name": "BRAF"}, "transcript": "ENST00000288602", "id": 206, "chromosome": null, "name": "BRAF codon(s) 600 any"}]}

applicable fields

 jq '.variant[] | [.dna_change, .coordinates ]'
[
  "1799T>A",
  "7:140453136-140453136"
]
[
  null,
  "7:140453135-140453137"
]

normalization results

jq '.features[] | [.start, .end, .ref, .alt ]'
[
  140453136,
  140453136,
  "T",
  "A"
]
[
  140453135,
  140453137,
  null,
  null
]

resulting association

{
  "features": [
    {
      "provenance_rule": "from_source",
      "end": 140453136,
      "name": "BRAF V600E",
      "links": [
        "http://reg.genome.network/refseq/RS000031",
        "http://reg.genome.network/refseq/RS000055",
        "http://myvariant.info/v1/variant/chr7:g.140453136A>T?assembly=hg19",
        "http://gnomad.broadinstitute.org/variant/7-140453136-A-T",
        "http://myvariant.info/v1/variant/chr7:g.140753336A>T?assembly=hg38",
        "http://reg.genome.network/allele/CA123643",
        "http://reg.genome.network/refseq/RS000007",
        "http://www.ncbi.nlm.nih.gov/snp/113488022",
        "http://www.ncbi.nlm.nih.gov/clinvar/variation/13961",
        "http://www.ncbi.nlm.nih.gov/clinvar/?term=29000[alleleid]",
        "http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=476",
        "http://exac.broadinstitute.org/variant/7-140453136-A-T",
        "http://reg.genome.network/refseq/RS000682"
      ],
      "sequence_ontology": {
        "hierarchy": [
          "SO:0001060",
          "SO:0001537",
          "SO:0001878",
          "SO:0001564",
          "SO:0001576",
          "SO:0001791",
          "SO:0001580",
          "SO:0001818",
          "SO:0001650",
          "SO:0001992"
        ],
        "soid": "SO:0001583",
        "root_name": "sequence_variant",
        "name": "missense_variant",
        "root_soid": "SO:0001060"
      },
      "provenance": [
        "http://reg.genome.network/allele?hgvs=NC_000007.13%3Ag.140453136A%3ET"
      ],
      "start": 140453136,
      "synonyms": [
        "NG_007873.3:g.176429T>A",
        "NC_000007.13:g.140453136A>T",
        "CM000669.1:g.140453136A>T",
        "chr7:g.140753336A>T",
        "NC_000007.14:g.140753336A>T",
        "LRG_299:g.176429T>A",
        "COSM476",
        "NC_000007.12:g.140099605A>T",
        "chr7:g.140453136A>T",
        "CM000669.2:g.140753336A>T",
        "7-140453136-A-T"
      ],
      "biomarker_type": "missense",
      "referenceName": "GRCh37",
      "geneSymbol": "BRAF",
      "alt": "A",
      "attributes": {
        "amino_acid_change": {
          "string_value": "V600E"
        },
        "germline": {
          "string_value": null
        },
        "partner_gene": {
          "string_value": null
        },
        "description": {
          "string_value": null
        },
        "exons": {
          "string_value": "15"
        },
        "notes": {
          "string_value": null
        },
        "cosmic": {
          "string_value": "476"
        },
        "effect": {
          "string_value": null
        },
        "cnv_type": {
          "string_value": null
        },
        "description_type": {
          "string_value": "HGVS"
        },
        "cytoband": {
          "string_value": null
        },
        "variant_type": {
          "string_value": "missense"
        },
        "dna_change": {
          "string_value": "1799T>A"
        },
        "codons": {
          "string_value": "600"
        },
        "chromosome_based_cnv": {
          "string_value": false
        },
        "transcript": {
          "string_value": "ENST00000288602"
        },
        "id": {
          "string_value": 112
        },
        "chromosome": {
          "string_value": null
        }
      },
      "ref": "T",
      "chromosome": "7"
    },
    {
      "provenance_rule": "from_source",
      "end": 140453137,
      "name": "BRAF codon(s) 600 any",
      "sequence_ontology": {
        "soid": "",
        "root_name": "Uncategorized",
        "name": "Uncategorized",
        "root_soid": ""
      },
      "start": 140453135,
      "biomarker_type": "any",
      "referenceName": "GRCh37",
      "geneSymbol": "BRAF",
      "attributes": {
        "amino_acid_change": {
          "string_value": null
        },
        "germline": {
          "string_value": null
        },
        "partner_gene": {
          "string_value": null
        },
        "description": {
          "string_value": null
        },
        "exons": {
          "string_value": "15"
        },
        "notes": {
          "string_value": null
        },
        "cosmic": {
          "string_value": null
        },
        "effect": {
          "string_value": null
        },
        "cnv_type": {
          "string_value": null
        },
        "description_type": {
          "string_value": "codon"
        },
        "cytoband": {
          "string_value": null
        },
        "variant_type": {
          "string_value": "any"
        },
        "dna_change": {
          "string_value": null
        },
        "codons": {
          "string_value": "600"
        },
        "chromosome_based_cnv": {
          "string_value": false
        },
        "transcript": {
          "string_value": "ENST00000288602"
        },
        "id": {
          "string_value": 206
        },
        "chromosome": {
          "string_value": null
        }
      },
      "chromosome": "7"
    }
  ],
  "tags": [],
  "genes": [
    "BRAF"
  ],
  "gene_identifiers": [
    {
      "symbol": "BRAF",
      "entrez_id": "673",
      "ensembl_gene_id": "ENSG00000157764"
    }
  ],
  "source_url": "https://pmkb.weill.cornell.edu/therapies/101",
  "source": "pmkb",
  "pmkb": "{\"tumor\": [{\"id\": 96, \"name\": \"Papillary Carcinoma\"}], \"tissues\": [{\"id\": 54, \"name\": \"Thyroid\"}], \"variant\": [{\"amino_acid_change\": \"V600E\", \"germline\": null, \"partner_gene\": null, \"description\": null, \"exons\": \"15\", \"cnv_type\": null, \"notes\": null, \"cosmic\": \"476\", \"effect\": null, \"coordinates\": \"7:140453136-140453136\", \"description_type\": \"HGVS\", \"cytoband\": null, \"variant_type\": \"missense\", \"dna_change\": \"1799T>A\", \"codons\": \"600\", \"chromosome_based_cnv\": false, \"gene\": {\"description\": null, \"created_at\": \"2015-12-09T20:08:27.000Z\", \"updated_at\": \"2017-08-24T22:00:40.000Z\", \"active_ind\": true, \"external_id\": \"157764\", \"id\": 33, \"name\": \"BRAF\"}, \"transcript\": \"ENST00000288602\", \"id\": 112, \"chromosome\": null, \"name\": \"BRAF V600E\"}, {\"amino_acid_change\": null, \"germline\": null, \"partner_gene\": null, \"description\": null, \"exons\": \"15\", \"cnv_type\": null, \"notes\": null, \"cosmic\": null, \"effect\": null, \"coordinates\": \"7:140453135-140453137\", \"description_type\": \"codon\", \"cytoband\": null, \"variant_type\": \"any\", \"dna_change\": null, \"codons\": \"600\", \"chromosome_based_cnv\": false, \"gene\": {\"description\": null, \"created_at\": \"2015-12-09T20:08:27.000Z\", \"updated_at\": \"2017-08-24T22:00:40.000Z\", \"active_ind\": true, \"external_id\": \"157764\", \"id\": 33, \"name\": \"BRAF\"}, \"transcript\": \"ENST00000288602\", \"id\": 206, \"chromosome\": null, \"name\": \"BRAF codon(s) 600 any\"}]}",
  "dev_tags": [
    "no-so"
  ],
  "feature_names": [
    "BRAF BRAF V600E",
    "BRAF BRAF codon(s) 600 any"
  ],
  "association": {
    "drug_labels": "NA",
    "description": "Eighty percent of all thyroid cancers are papillary thyroid carcinomas (PTCs). BRAF is part of the mitogen-activated protein kinase (MAPK) signaling pathway and V600E is an activating mutation of BRAF.  The BRAF V600E mutation has been reported in 45% of patients with papillary thyroid carcinoma. The BRAF V600E-like PTC's (BVL) and the RAS-like PTC (RL-PTC) are fundamentally different in their genomic, epigenomic, and proteomic profiles. Presence of a BRAF p.Val600Glu (V600E) mutation is highly specific for papillary thyroid carcinoma and is only rarely associated with the follicular variant PTC , other well-differentiated thyroid neoplasms or nodular goiters. The possible prognostic impact of BRAF V600E mutations in papillary carcinoma of the thyroid continues to be studied. ",
    "publication_url": "",
    "source_link": "https://pmkb.weill.cornell.edu/therapies/101",
    "phenotypes": [
      {
        "source": "http://purl.obolibrary.org/obo/DOID_3113",
        "term": "papillary carcinoma",
        "id": "DOID:3113",
        "family": "carcinoma",
        "description": "papillary carcinoma"
      }
    ],
    "evidence": [
      {
        "info": {
          "publications": []
        },
        "evidenceType": {
          "sourceName": "pmkb"
        },
        "description": "1"
      }
    ],
    "evidence_label": "A",
    "response_type": "NA",
    "evidence_level": 1
  }
}

This is almost entirely correct. However, the issue starts with the normalization results. Here, we have:

jq '.features[] | [.start, .end, .ref, .alt ]'
[
  140453136,
  140453136,
  "T",
  "A"
], ...

when it should instead be:

jq '.features[] | [.start, .end, .ref, .alt ]'
[
  140453136,
  140453136,
  "A",
  "T"
], ...

Because ref and alt propagate (instead of being overwritten at normalization), the record is correct where normalized, but incorrect specifically in the ref and alt fields. The reason for this error is that the specified ref and alt (pulled from the dna_change field) refer to a CDS change, not a gDNA change.

To fix this, we can either overwrite ref and alt at normalization (less ideal), or translate the transcript change to a genomic change (more ideal).

@ahwagner Thanks.
Given https://en.wikipedia.org/wiki/Complementary_DNA is it too simplistic to simply swap them?

diff --git a/harvester/pmkb.py b/harvester/pmkb.py
index 2506483..d2f8991 100644
--- a/harvester/pmkb.py
+++ b/harvester/pmkb.py
@@ -59,9 +59,11 @@ def convert(interpretation):
             if dna_change and '>' in dna_change:
                 prefix, alt = dna_change.split('>')
                 ref = prefix[-len(alt):]
+                # the specified ref and alt (pulled from the dna_change field)
+                # refer to a CDS change, not a gDNA change.
                 if len(ref) > 0 and len(alt) > 0:
-                    feature['ref'] = ref
-                    feature['alt'] = alt
+                    feature['ref'] = alt
+                    feature['alt'] = ref

         if attributes['amino_acid_change']['string_value']:
            variant_name.append(attributes['amino_acid_change']['string_value'])

This is too simplistic, unfortunately. For instance, instead of a simple SNV, it may be an insdel (such as V600D, CDS change: 1799_1800TG>AT), or it may be a + strand gene (where ref and alt remain unchanged).

Hmm. The calculating the complement is straightforward enough, but how to reliably determine the strand of the gene (without transcript)?

It looks like we do have a transcript ID from every PMKB association:

association['features'][0]['attributes']['transcript']['string_value']

We could use that plus the gene symbol (e.g. BRAF) to get the relevant info from biomart:

wget -q -O - 'http://grch37.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query  virtualSchemaName = "default" formatter = "CSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
<Dataset name = "hsapiens_gene_ensembl" interface = "default" >
<Filter name = "hgnc_symbol" value = "BRAF"/>
<Attribute name = "ensembl_transcript_id" />
<Attribute name = "strand" />
</Dataset>
</Query>' | grep 'ENST00000497784'

A value of -1 is a minus strand gene, which would need ref and alt reverse complemented

Reverse complement is simply:

COMPLEMENT = {
    'A': 'T',
    'T': 'A',
    'G': 'C',
    'C': 'G',
    '-': '-'
}

complement_map = str.maketrans(COMPLEMENT)

ref = ref[::-1].translate(complement_map)
alt = alt[::-1].translate(complement_map)

{
"provenance_rule": "from_source",
"end": 140453136,
"name": "BRAF V600E",
"links": [
"http://reg.genome.network/refseq/RS000031",
"http://reg.genome.network/refseq/RS000055",
"http://myvariant.info/v1/variant/chr7:g.140453136A>T?assembly=hg19",
"http://gnomad.broadinstitute.org/variant/7-140453136-A-T",
"http://myvariant.info/v1/variant/chr7:g.140753336A>T?assembly=hg38",
"http://reg.genome.network/allele/CA123643",
"http://reg.genome.network/refseq/RS000007",
"http://www.ncbi.nlm.nih.gov/snp/113488022",
"http://www.ncbi.nlm.nih.gov/clinvar/variation/13961",
"http://www.ncbi.nlm.nih.gov/clinvar/?term=29000[alleleid]",
"http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=476",
"http://exac.broadinstitute.org/variant/7-140453136-A-T",
"http://reg.genome.network/refseq/RS000682"
],
"sequence_ontology": {
"hierarchy": [
"SO:0001060",
"SO:0001537",
"SO:0001878",
"SO:0001564",
"SO:0001576",
"SO:0001791",
"SO:0001580",
"SO:0001818",
"SO:0001650",
"SO:0001992"
],
"soid": "SO:0001583",
"root_name": "sequence_variant",
"name": "missense_variant",
"root_soid": "SO:0001060"
},
"provenance": [
"http://reg.genome.network/allele?hgvs=NC_000007.13%3Ag.140453136A%3ET"
],
"start": 140453136,
"synonyms": [
"NG_007873.3:g.176429T>A",
"NC_000007.13:g.140453136A>T",
"CM000669.1:g.140453136A>T",
"chr7:g.140753336A>T",
"NC_000007.14:g.140753336A>T",
"LRG_299:g.176429T>A",
"COSM476",
"NC_000007.12:g.140099605A>T",
"chr7:g.140453136A>T",
"CM000669.2:g.140753336A>T",
"7-140453136-A-T"
],
"biomarker_type": "missense",
"referenceName": "GRCh37",
"geneSymbol": "BRAF",
"alt": "A",
"attributes": {
"amino_acid_change": {
"string_value": "V600E"
},
"germline": {
"string_value": null
},
"partner_gene": {
"string_value": null
},
"description": {
"string_value": null
},
"exons": {
"string_value": "15"
},
"notes": {
"string_value": null
},
"cosmic": {
"string_value": "476"
},
"effect": {
"string_value": null
},
"cnv_type": {
"string_value": null
},
"description_type": {
"string_value": "HGVS"
},
"cytoband": {
"string_value": null
},
"variant_type": {
"string_value": "missense"
},
"dna_change": {
"string_value": "1799T>A"
},
"codons": {
"string_value": "600"
},
"chromosome_based_cnv": {
"string_value": false
},
"transcript": {
"string_value": "ENST00000288602"
},
"id": {
"string_value": 112
},
"chromosome": {
"string_value": null
}
},
"ref": "T",
"chromosome": "7"
},