BioJulia/GeneticVariation.jl

cannot read vcf from Sanger

pfarndt opened this issue · 1 comments

I just try to parse a vcf file recently downloaded from the Sanger Institute, the first lines read:

##fileformat=VCFv4.1
##source=COSMICv91
##reference=GRCh38
##fileDate=20200324
##comment="Missing nucleotide details indicate ambiguity during curation process"
##comment="URL stub for ID field (use numeric portion of ID)='https://cancer.sanger.ac.uk/cosmic/ncv/overview?genome=38&id='"
##comment="REF and ALT sequences are both forward strand
##INFO=<ID=GENE,Number=1,Type=String,Description="Gene name">
##INFO=<ID=STRAND,Number=1,Type=String,Description="Gene strand">
##INFO=<ID=GENOMIC_ID,Number=1,Type=String,Description="Genomic Mutation ID">
##INFO=<ID=LEGACY_ID,Number=1,Type=String,Description="Legacy Mutation ID">
##INFO=<ID=CDS,Number=1,Type=String,Description="CDS annotation">
##INFO=<ID=AA,Number=1,Type=String,Description="Peptide annotation">
##INFO=<ID=HGVSC,Number=1,Type=String,Description="HGVS cds syntax">
##INFO=<ID=HGVSP,Number=1,Type=String,Description="HGVS peptide syntax">
##INFO=<ID=HGVSG,Number=1,Type=String,Description="HGVS genomic syntax">
##INFO=<ID=CNT,Number=1,Type=Integer,Description="How many samples have this mutation">
##INFO=<ID=SNP,Number=0,Type=Flag,Description="classified as SNP">
#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO
1       10108   COSV70831266    C       T       .       .       GENE=DDX11L1;STRAND=+;LEGACY_ID=COSN28762392;CNT=1
1       10108   COSV70831266    C       T       .       .       GENE=DDX11L1_ENST00000450305;STRAND=+;LEGACY_ID=COSN28762392;CNT=1
1       10108   COSV70831266    C       T       .       .       GENE=WASH7P;STRAND=-;LEGACY_ID=COSN28762392;CNT=1
1       10151   COSV70830383    T       A       .       .       GENE=DDX11L1_ENST00000450305;STRAND=+;LEGACY_ID=COSN14661299;CNT=1
1       10151   COSV70830383    T       A       .       .       GENE=WASH7P;STRAND=-;LEGACY_ID=COSN14661299;CNT=1
1       10151   COSV70830383    T       A       .       .       GENE=DDX11L1;STRAND=+;LEGACY_ID=COSN14661299;CNT=1

I am not able to read this file since this code:

stream = open("test.vcf", "r")
reader = VCF.Reader(stream)

for record in reader
    @show record
#     println(VCF.info(record))
end
close(reader)

throws an error

AssertionError: endpos != 0

Stacktrace:
 [1] infovalrange at /home//.julia/packages/GeneticVariation/r8DAL/src/vcf/record.jl:527 [inlined]
 [2] info(::GeneticVariation.VCF.Record) at /home//.julia/packages/GeneticVariation/r8DAL/src/vcf/record.jl:466

Is this reader compliant to the fileformat=VCFv4.1?

Same problem with Illumina's VCF, the issue is that infovalrange expect the INFO field to be terminated by a ; or a \t. I had a quick look at the specifications but I'm not sure what's correct. What do you think @benjward ?

https://samtools.github.io/hts-specs/VCFv4.2.pdf

Anyway @pfarndt, you can patch things with :

@eval GeneticVariation.VCF begin 

    find_enpos(k, data, key) = something(findnext(isequal(k), data, last(key)+1), 0)

    # Returns the data range of the `i`-th value.
    function infovalrange(record::Record, i::Int)
        checkfilled(record)
        data = record.data
        key = record.infokey[i]
        if last(key) + 1 ≤ lastindex(data) && data[last(key)+1] == UInt8('=')
            endpos =  find_enpos(0x3B, data, key)# 0x3B is byte equivalent of char ';'.
            if endpos == 0
                endpos = find_enpos(0x09, data, key)# 0x09 is byte equivalent of char '\t'
                endpos = endpos == 0 ? lastindex(data) : endpos# no separator found, use the whole line
            end
            return last(key)+2:endpos-1
        else
            return last(key)+1:last(key)
        end
    end

end