Reading Gencode GTF failed
Snowymint opened this issue · 1 comment
{
"name": "AttributeError",
"message": "'float' object has no attribute 'split'",
"stack": "---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "/root/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/joblib/externals/loky/process_executor.py", line 463, in _process_worker
r = call_item()
File "/root/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/joblib/externals/loky/process_executor.py", line 291, in __call__
return self.fn(*self.args, **self.kwargs)
File "/root/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/joblib/parallel.py", line 589, in __call__
return [func(*args, **kwargs)
File "/root/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/joblib/parallel.py", line 589, in <listcomp>
return [func(*args, **kwargs)
File "/root/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/genomicranges/io/gtf.py", line 26, in _parse_all_attribute
infos = attr.split(";")
AttributeError: 'float' object has no attribute 'split'
"""
The above exception was the direct cause of the following exception:
AttributeError Traceback (most recent call last)
gr_gencode = genomicranges.read_gtf("/home/n/MA/MA/Milestone_1/blast_db/Homo_sapiens/Gencode_files/gencode.v22.annotation.gtf")
File ~/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/genomicranges/io/gtf.py:103, in read_gtf(file)
94 """Read GTF file as :py:class:`~genomicranges.GenomicRanges.GenomicRanges`.
95
96 Args:
(...)
100 GenomicRanges: Genome annotations from GTF.
101 """
102 compressed = True if file.endswith("gz") else False
--> 103 data = parse_gtf(file, compressed=compressed)
105 return from_pandas(data)
File ~/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/genomicranges/io/gtf.py:84, in parse_gtf(path, compressed)
67 else:
68 df = read_csv(
69 path,
70 sep="\t",
(...)
81 ],
82 )
---> 84 rows = Parallel(n_jobs=-2)(
85 delayed(_parse_all_attribute)(row) for _, row in df.iterrows()
86 )
87 gtf = DataFrame.from_records(rows)
88 gtf.drop(["group"], axis=1)
File ~/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/joblib/parallel.py:1952, in Parallel.__call__(self, iterable)
1946 # The first item from the output is blank, but it makes the interpreter
1947 # progress until it enters the Try/Except block of the generator and
1948 # reach the first `yield` statement. This starts the aynchronous
1949 # dispatch of the tasks to the workers.
1950 next(output)
-> 1952 return output if self.return_generator else list(output)
File ~/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/joblib/parallel.py:1595, in Parallel._get_outputs(self, iterator, pre_dispatch)
1592 yield
1594 with self._backend.retrieval_context():
-> 1595 yield from self._retrieve()
1597 except GeneratorExit:
1598 # The generator has been garbage collected before being fully
1599 # consumed. This aborts the remaining tasks if possible and warn
1600 # the user if necessary.
1601 self._exception = True
File ~/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/joblib/parallel.py:1699, in Parallel._retrieve(self)
1692 while self._wait_retrieval():
1693
1694 # If the callback thread of a worker has signaled that its task
1695 # triggered an exception, or if the retrieval loop has raised an
1696 # exception (e.g. `GeneratorExit`), exit the loop and surface the
1697 # worker traceback.
1698 if self._aborting:
-> 1699 self._raise_error_fast()
1700 break
1702 # If the next job is not ready for retrieval yet, we just wait for
1703 # async callbacks to progress.
File ~/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/joblib/parallel.py:1734, in Parallel._raise_error_fast(self)
1730 # If this error job exists, immediatly raise the error by
1731 # calling get_result. This job might not exists if abort has been
1732 # called directly or if the generator is gc'ed.
1733 if error_job is not None:
-> 1734 error_job.get_result(self.timeout)
File ~/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/joblib/parallel.py:736, in BatchCompletionCallBack.get_result(self, timeout)
730 backend = self.parallel._backend
732 if backend.supports_retrieve_callback:
733 # We assume that the result has already been retrieved by the
734 # callback thread, and is stored internally. It's just waiting to
735 # be returned.
--> 736 return self._return_or_raise()
738 # For other backends, the main thread needs to run the retrieval step.
739 try:
File ~/miniconda3/envs/ncbi-workflow/lib/python3.10/site-packages/joblib/parallel.py:754, in BatchCompletionCallBack._return_or_raise(self)
752 try:
753 if self.status == TASK_ERROR:
--> 754 raise self._result
755 return self._result
756 finally:
AttributeError: 'float' object has no attribute 'split'"
}
@Snowymint v0.4.1 should work with GENCODE files. If the GTF file contains additional comment lines, use the `skip` or `comment` arguments to adjust the reader. Feel free to reach out if you run into more issues!
import genomicranges
gr = genomicranges.read_gtf("./gencode.v22.annotation.gtf.gz")
print(gr)
GenomicRanges with 2563671 ranges and 2563671 metadata columns
seqnames ranges strand source feature score frame group gene_id
<str> <IRanges> <ndarray[int64]> <list> <list> <list> <list> <list> <list>
0 chr1 11869 - 14409 + | HAVANA gene . . gene_id "ENSG00000223972.5"; gene_typ... ENSG00000223972.5
1 chr1 11869 - 14409 + | HAVANA transcript . . gene_id "ENSG00000223972.5"; transcri... ENSG00000223972.5
2 chr1 11869 - 12227 + | HAVANA exon . . gene_id "ENSG00000223972.5"; transcri... ENSG00000223972.5
... ... ... | ... ... ... ... ... ...
2563668 chrM 15956 - 16023 - | ENSEMBL gene . . gene_id "ENSG00000210196.2"; gene_typ... ENSG00000210196.2
2563669 chrM 15956 - 16023 - | ENSEMBL transcript . . gene_id "ENSG00000210196.2"; transcri... ENSG00000210196.2
2563670 chrM 15956 - 16023 - | ENSEMBL exon . . gene_id "ENSG00000210196.2"; transcri... ENSG00000210196.2
gene_type gene_status gene_name level havana_gene transcript_id transcript_type transcript_status
<list> <list> <list> <list> <list> <list> <list> <list>
0 transcribed_unprocessed_pseudogene KNOWN DDX11L1 2 OTTHUMG00000000961.2 nan nan nan
1 transcribed_unprocessed_pseudogene KNOWN DDX11L1 2 OTTHUMG00000000961.2 ENST00000456328.2 processed_transcript KNOWN
2 transcribed_unprocessed_pseudogene KNOWN DDX11L1 2 OTTHUMG00000000961.2 ENST00000456328.2 processed_transcript KNOWN
... ... ... ... ... ... ... ...
2563668 Mt_tRNA KNOWN MT-TP 3 nan nan nan nan
2563669 Mt_tRNA KNOWN MT-TP 3 nan ENST00000387461.2 Mt_tRNA KNOWN
2563670 Mt_tRNA KNOWN MT-TP 3 nan ENST00000387461.2 Mt_tRNA KNOWN
transcript_name tag transcript_support_level havana_transcript exon_number exon_id ont protein_id ccdsid
<list> <list> <list> <list> <list> <list> <list> <list> <list>
0 nan nan nan nan nan nan nan nan nan
1 DDX11L1-002 basic 1 OTTHUMT00000362751.1 nan nan nan nan nan
2 DDX11L1-002 basic 1 OTTHUMT00000362751.1 1 ENSE00002234944.1 nan nan nan
... ... ... ... ... ... ... ... ...
2563668 nan nan nan nan nan nan nan nan nan
2563669 MT-TP-201 basic NA nan nan nan nan nan nan
2563670 MT-TP-201 basic NA nan 1 ENSE00001544473.2 nan nan nan
------
seqinfo(25 sequences): chr1 chr10 chr11 ... chrM chrX chrY