Reading Gencode GTF failed

Question

Reading Gencode GTF failed

Snowymint opened this issue a year ago · 1 comment

{
"name": "AttributeError",
"message": "'float' object has no attribute 'split'",
"stack": "---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "/root/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/joblib/externals/loky/process_executor.py", line 463, in _process_worker
r = call_item()
File "/root/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/joblib/externals/loky/process_executor.py", line 291, in call
return self.fn(*self.args, **self.kwargs)
File "/root/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/joblib/parallel.py", line 589, in call
return [func(*args, **kwargs)
File "/root/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/joblib/parallel.py", line 589, in
return [func(*args, **kwargs)
File "/root/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/genomicranges/io/gtf.py", line 26, in _parse_all_attribute
infos = attr.split(";")
AttributeError: 'float' object has no attribute 'split'
"""

The above exception was the direct cause of the following exception:

AttributeError Traceback (most recent call last)
gr_gencode = genomicranges.read_gtf("/home/n/MA/MA/Milestone_1/blast_db/Homo_sapiens/Gencode_files/gencode.v22.annotation.gtf")

File ~/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/genomicranges/io/gtf.py:103, in read_gtf(file)
94 """Read GTF file as :py:class:~genomicranges.GenomicRanges.GenomicRanges.
95
96 Args:
(...)
100 GenomicRanges: Genome annotations from GTF.
101 """
102 compressed = True if file.endswith("gz") else False
--> 103 data = parse_gtf(file, compressed=compressed)
105 return from_pandas(data)

File ~/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/genomicranges/io/gtf.py:84, in parse_gtf(path, compressed)
67 else:
68 df = read_csv(
69 path,
70 sep="\t",
(...)
81 ],
82 )
---> 84 rows = Parallel(n_jobs=-2)(
85 delayed(_parse_all_attribute)(row) for _, row in df.iterrows()
86 )
87 gtf = DataFrame.from_records(rows)
88 gtf.drop(["group"], axis=1)

File ~/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/joblib/parallel.py:1952, in Parallel.call(self, iterable)
1946 # The first item from the output is blank, but it makes the interpreter
1947 # progress until it enters the Try/Except block of the generator and
1948 # reach the first yield statement. This starts the aynchronous
1949 # dispatch of the tasks to the workers.
1950 next(output)
-> 1952 return output if self.return_generator else list(output)

File ~/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/joblib/parallel.py:1595, in Parallel._get_outputs(self, iterator, pre_dispatch)
1592 yield
1594 with self._backend.retrieval_context():
-> 1595 yield from self._retrieve()
1597 except GeneratorExit:
1598 # The generator has been garbage collected before being fully
1599 # consumed. This aborts the remaining tasks if possible and warn
1600 # the user if necessary.
1601 self._exception = True

File ~/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/joblib/parallel.py:1699, in Parallel._retrieve(self)
1692 while self._wait_retrieval():
1693
1694 # If the callback thread of a worker has signaled that its task
1695 # triggered an exception, or if the retrieval loop has raised an
1696 # exception (e.g. GeneratorExit), exit the loop and surface the
1697 # worker traceback.
1698 if self._aborting:
-> 1699 self._raise_error_fast()
1700 break
1702 # If the next job is not ready for retrieval yet, we just wait for
1703 # async callbacks to progress.

File ~/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/joblib/parallel.py:1734, in Parallel._raise_error_fast(self)
1730 # If this error job exists, immediatly raise the error by
1731 # calling get_result. This job might not exists if abort has been
1732 # called directly or if the generator is gc'ed.
1733 if error_job is not None:
-> 1734 error_job.get_result(self.timeout)

File ~/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/joblib/parallel.py:736, in BatchCompletionCallBack.get_result(self, timeout)
730 backend = self.parallel._backend
732 if backend.supports_retrieve_callback:
733 # We assume that the result has already been retrieved by the
734 # callback thread, and is stored internally. It's just waiting to
735 # be returned.
--> 736 return self._return_or_raise()
738 # For other backends, the main thread needs to run the retrieval step.
739 try:

File ~/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/joblib/parallel.py:754, in BatchCompletionCallBack._return_or_raise(self)
752 try:
753 if self.status == TASK_ERROR:
--> 754 raise self._result
755 return self._result
756 finally:

AttributeError: 'float' object has no attribute 'split'"
}

Answer 1 · 2023-12-01T00:39:16.000Z

@Snowymint v0.4.1 should work with GENCODE files. If your GTF file contains additional comment lines, use the `skip` or `comment` arguments of the reader so that those lines are ignored. Feel free to reach out if you run into more issues!

import genomicranges
gr = genomicranges.read_gtf("./gencode.v22.annotation.gtf.gz")

print(gr)

GenomicRanges with 2563671 ranges and 2563671 metadata columns
        seqnames        ranges           strand    source    feature  score  frame                                    group           gene_id
           <str>     <IRanges> <ndarray[int64]>    <list>     <list> <list> <list>                                   <list>            <list>
      0     chr1 11869 - 14409                + |  HAVANA       gene      .      . gene_id "ENSG00000223972.5"; gene_typ... ENSG00000223972.5
      1     chr1 11869 - 14409                + |  HAVANA transcript      .      . gene_id "ENSG00000223972.5"; transcri... ENSG00000223972.5
      2     chr1 11869 - 12227                + |  HAVANA       exon      .      . gene_id "ENSG00000223972.5"; transcri... ENSG00000223972.5
             ...           ...              ... |     ...        ...    ...    ...                                      ...               ...
2563668     chrM 15956 - 16023                - | ENSEMBL       gene      .      . gene_id "ENSG00000210196.2"; gene_typ... ENSG00000210196.2
2563669     chrM 15956 - 16023                - | ENSEMBL transcript      .      . gene_id "ENSG00000210196.2"; transcri... ENSG00000210196.2
2563670     chrM 15956 - 16023                - | ENSEMBL       exon      .      . gene_id "ENSG00000210196.2"; transcri... ENSG00000210196.2
                                 gene_type gene_status gene_name  level          havana_gene     transcript_id      transcript_type transcript_status
                                    <list>      <list>    <list> <list>               <list>            <list>               <list>            <list>
      0 transcribed_unprocessed_pseudogene       KNOWN   DDX11L1      2 OTTHUMG00000000961.2               nan                  nan               nan
      1 transcribed_unprocessed_pseudogene       KNOWN   DDX11L1      2 OTTHUMG00000000961.2 ENST00000456328.2 processed_transcript             KNOWN
      2 transcribed_unprocessed_pseudogene       KNOWN   DDX11L1      2 OTTHUMG00000000961.2 ENST00000456328.2 processed_transcript             KNOWN
                                       ...         ...       ...    ...                  ...               ...                  ...               ...
2563668                            Mt_tRNA       KNOWN     MT-TP      3                  nan               nan                  nan               nan
2563669                            Mt_tRNA       KNOWN     MT-TP      3                  nan ENST00000387461.2              Mt_tRNA             KNOWN
2563670                            Mt_tRNA       KNOWN     MT-TP      3                  nan ENST00000387461.2              Mt_tRNA             KNOWN
        transcript_name    tag transcript_support_level    havana_transcript exon_number           exon_id    ont protein_id ccdsid
                 <list> <list>                   <list>               <list>      <list>            <list> <list>     <list> <list>
      0             nan    nan                      nan                  nan         nan               nan    nan        nan    nan
      1     DDX11L1-002  basic                        1 OTTHUMT00000362751.1         nan               nan    nan        nan    nan
      2     DDX11L1-002  basic                        1 OTTHUMT00000362751.1           1 ENSE00002234944.1    nan        nan    nan
                    ...    ...                      ...                  ...         ...               ...    ...        ...    ...
2563668             nan    nan                      nan                  nan         nan               nan    nan        nan    nan
2563669       MT-TP-201  basic                       NA                  nan         nan               nan    nan        nan    nan
2563670       MT-TP-201  basic                       NA                  nan           1 ENSE00001544473.2    nan        nan    nan
------
seqinfo(25 sequences): chr1 chr10 chr11 ... chrM chrX chrY