Missing documentation regarding background and color schemes (failure to generate seqlogo using ambiguous DNA)
blaiseli opened this issue · 0 comments
Describe the bug
The documentation lacks examples of how to generate seqlogos when ambiguous nucleotides are present.
I have sequences made of A, C, G, T and N, for which I computed frequency values in a pandas data frame:
>>> freqs
A C G T N
position
1 0.319161 0.245934 0.218972 0.215933 0.000000
2 0.358897 0.173274 0.202930 0.264629 0.000270
3 0.264792 0.241902 0.214400 0.278905 0.000001
4 0.290173 0.194422 0.303271 0.212132 0.000002
5 0.329992 0.246832 0.182880 0.240293 0.000002
6 0.279256 0.235955 0.219061 0.265727 0.000001
7 0.278154 0.202007 0.314267 0.205572 0.000000
8 0.287437 0.231733 0.225736 0.255093 0.000001
9 0.267679 0.239583 0.227086 0.265649 0.000002
10 0.296314 0.211029 0.305474 0.187179 0.000004
11 0.341542 0.226820 0.208675 0.222963 0.000000
12 0.302927 0.220687 0.236567 0.239816 0.000003
13 0.250451 0.210791 0.304559 0.234198 0.000000
14 0.372832 0.231344 0.193433 0.202391 0.000000
15 0.284385 0.069736 0.282652 0.363227 0.000000
16 0.217580 0.201708 0.354618 0.226093 0.000000
17 0.365647 0.212575 0.189495 0.232283 0.000000
18 0.294462 0.080645 0.279089 0.345804 0.000000
19 0.259917 0.238632 0.280305 0.221141 0.000004
20 0.287875 0.240443 0.221388 0.250294 0.000000
21 0.275017 0.217983 0.227088 0.279912 0.000000
22 0.293021 0.210776 0.292389 0.203810 0.000004
23 0.329954 0.232754 0.196362 0.240930 0.000000
24 0.295759 0.209505 0.216758 0.277978 0.000000
25 0.309162 0.196959 0.316347 0.177531 0.000000
26 0.327185 0.222645 0.232940 0.217229 0.000000
27 0.286854 0.226503 0.225384 0.261259 0.000000
28 0.250997 0.244785 0.257392 0.246827 0.000000
29 0.284841 0.243132 0.234195 0.237833 0.000000
I haven't found in the documentation how to have this data accepted by seqlogo.Ppm
.
By exploring the source code and experimenting, I finally found two ways to do it.
- Using a custom 5-letters alphabet:
>>> logo5 = seqlogo.Ppm(freqs, alphabet_type="custom", alphabet="ACGTN", background=[0.25, 0.25, 0.25, 0.25, 0.0])
- Adding an extra "-" column to match the pre-defined 6-letters "reduced DNA" alphabet:
>>> freqs["-"] = np.zeros(29)
>>> logo6 = seqlogo.Ppm(freqs, alphabet_type="reduced DNA", background=[0.25, 0.25, 0.25, 0.25, 0.0, 0.0])
But I'm not sure to understand exactly what I should put in background
.
Then, I failed to use seqlogo.seqlogo
. In particular, I feel there should be some information in the documentation about how to specify colour schemes.
To Reproduce
Some attempts with the custom alphabet:
>>> seqlogo.seqlogo(logo5, filename="/tmp/logo5.pdf", format="pdf")
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-28-b1c1bb251418> in <module>
----> 1 seqlogo.seqlogo(logo5, filename="/tmp/logo5.pdf", format="pdf")
~/src/bioinfo_utils/.venv/lib/python3.8/site-packages/seqlogo/seqlogo.py in seqlogo(pm, ic_scale, color_scheme, size, format, filename, **kwargs)
70 raise ValueError('{} color_scheme selected is not an allowed amino acid color scheme'.format(color_scheme))
71
---> 72 color_scheme = wl.std_color_schemes[color_scheme]
73
74 # Setup the format writer
KeyError: None
>>> seqlogo.seqlogo(logo5, filename="/tmp/logo5.pdf", color_scheme="classic", format="pdf")
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/src/bioinfo_utils/.venv/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2890 try:
-> 2891 return self._engine.get_loc(casted_key)
2892 except KeyError as err:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
KeyError: 0
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-29-37d517bbcb2a> in <module>
----> 1 seqlogo.seqlogo(logo5, filename="/tmp/logo5.pdf", color_scheme="classic", format="pdf")
~/src/bioinfo_utils/.venv/lib/python3.8/site-packages/seqlogo/seqlogo.py in seqlogo(pm, ic_scale, color_scheme, size, format, filename, **kwargs)
89 logo_format = wl.LogoFormat(pm, options)
90
---> 91 out = out_format(pm, logo_format)
92
93 # Create the file if the user supplied an filename
~/src/bioinfo_utils/.venv/lib/python3.8/site-packages/weblogo/logo_formatter.py in pdf_formatter(logodata, logoformat)
34 def pdf_formatter(logodata: LogoData, logoformat: LogoFormat) -> bytes:
35 """ Generate a logo in PDF format."""
---> 36 eps = eps_formatter(logodata, logoformat).decode()
37 gs = GhostscriptAPI()
38 return gs.convert('pdf', eps, logoformat.logo_width, logoformat.logo_height)
~/src/bioinfo_utils/.venv/lib/python3.8/site-packages/weblogo/logo_formatter.py in eps_formatter(logodata, logoformat)
162
163 if conv_factor:
--> 164 stack_height = logodata.entropy[seq_index] * std_units[logoformat.unit_name]
165 else:
166 stack_height = 1.0 # probability # pragma: no cover
~/src/bioinfo_utils/.venv/lib/python3.8/site-packages/pandas/core/series.py in __getitem__(self, key)
880
881 elif key_is_scalar:
--> 882 return self._get_value(key)
883
884 if (
~/src/bioinfo_utils/.venv/lib/python3.8/site-packages/pandas/core/series.py in _get_value(self, label, takeable)
989
990 # Similar to Index.get_value, but we do not fall back to positional
--> 991 loc = self.index.get_loc(label)
992 return self.index._get_values_for_loc(self, loc, label)
993
~/src/bioinfo_utils/.venv/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2891 return self._engine.get_loc(casted_key)
2892 except KeyError as err:
-> 2893 raise KeyError(key) from err
2894
2895 if tolerance is not None:
KeyError: 0
Some attempts with the "reduced DNA" alphabet:
>>> seqlogo.seqlogo(logo6, filename="/tmp/logo6.pdf", format="pdf")
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/src/bioinfo_utils/.venv/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2890 try:
-> 2891 return self._engine.get_loc(casted_key)
2892 except KeyError as err:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
KeyError: 0
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-30-0a0df93d7b64> in <module>
----> 1 seqlogo.seqlogo(logo6, filename="/tmp/logo6.pdf", format="pdf")
~/src/bioinfo_utils/.venv/lib/python3.8/site-packages/seqlogo/seqlogo.py in seqlogo(pm, ic_scale, color_scheme, size, format, filename, **kwargs)
89 logo_format = wl.LogoFormat(pm, options)
90
---> 91 out = out_format(pm, logo_format)
92
93 # Create the file if the user supplied an filename
~/src/bioinfo_utils/.venv/lib/python3.8/site-packages/weblogo/logo_formatter.py in pdf_formatter(logodata, logoformat)
34 def pdf_formatter(logodata: LogoData, logoformat: LogoFormat) -> bytes:
35 """ Generate a logo in PDF format."""
---> 36 eps = eps_formatter(logodata, logoformat).decode()
37 gs = GhostscriptAPI()
38 return gs.convert('pdf', eps, logoformat.logo_width, logoformat.logo_height)
~/src/bioinfo_utils/.venv/lib/python3.8/site-packages/weblogo/logo_formatter.py in eps_formatter(logodata, logoformat)
162
163 if conv_factor:
--> 164 stack_height = logodata.entropy[seq_index] * std_units[logoformat.unit_name]
165 else:
166 stack_height = 1.0 # probability # pragma: no cover
~/src/bioinfo_utils/.venv/lib/python3.8/site-packages/pandas/core/series.py in __getitem__(self, key)
880
881 elif key_is_scalar:
--> 882 return self._get_value(key)
883
884 if (
~/src/bioinfo_utils/.venv/lib/python3.8/site-packages/pandas/core/series.py in _get_value(self, label, takeable)
989
990 # Similar to Index.get_value, but we do not fall back to positional
--> 991 loc = self.index.get_loc(label)
992 return self.index._get_values_for_loc(self, loc, label)
993
~/src/bioinfo_utils/.venv/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2891 return self._engine.get_loc(casted_key)
2892 except KeyError as err:
-> 2893 raise KeyError(key) from err
2894
2895 if tolerance is not None:
KeyError: 0
>>> seqlogo.seqlogo(logo6, filename="/tmp/logo6.pdf", color_scheme="classic", format="pdf")
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/src/bioinfo_utils/.venv/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2890 try:
-> 2891 return self._engine.get_loc(casted_key)
2892 except KeyError as err:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
KeyError: 0
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-31-4fd2890af7d4> in <module>
----> 1 seqlogo.seqlogo(logo6, filename="/tmp/logo6.pdf", color_scheme="classic", format="pdf")
~/src/bioinfo_utils/.venv/lib/python3.8/site-packages/seqlogo/seqlogo.py in seqlogo(pm, ic_scale, color_scheme, size, format, filename, **kwargs)
89 logo_format = wl.LogoFormat(pm, options)
90
---> 91 out = out_format(pm, logo_format)
92
93 # Create the file if the user supplied an filename
~/src/bioinfo_utils/.venv/lib/python3.8/site-packages/weblogo/logo_formatter.py in pdf_formatter(logodata, logoformat)
34 def pdf_formatter(logodata: LogoData, logoformat: LogoFormat) -> bytes:
35 """ Generate a logo in PDF format."""
---> 36 eps = eps_formatter(logodata, logoformat).decode()
37 gs = GhostscriptAPI()
38 return gs.convert('pdf', eps, logoformat.logo_width, logoformat.logo_height)
~/src/bioinfo_utils/.venv/lib/python3.8/site-packages/weblogo/logo_formatter.py in eps_formatter(logodata, logoformat)
162
163 if conv_factor:
--> 164 stack_height = logodata.entropy[seq_index] * std_units[logoformat.unit_name]
165 else:
166 stack_height = 1.0 # probability # pragma: no cover
~/src/bioinfo_utils/.venv/lib/python3.8/site-packages/pandas/core/series.py in __getitem__(self, key)
880
881 elif key_is_scalar:
--> 882 return self._get_value(key)
883
884 if (
~/src/bioinfo_utils/.venv/lib/python3.8/site-packages/pandas/core/series.py in _get_value(self, label, takeable)
989
990 # Similar to Index.get_value, but we do not fall back to positional
--> 991 loc = self.index.get_loc(label)
992 return self.index._get_values_for_loc(self, loc, label)
993
~/src/bioinfo_utils/.venv/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2891 return self._engine.get_loc(casted_key)
2892 except KeyError as err:
-> 2893 raise KeyError(key) from err
2894
2895 if tolerance is not None:
KeyError: 0
Expected behavior
The documentation should explain how to deal with data containing ambiguous DNA, in particular regarding the "background" and "color_scheme" arguments, ideally including examples.
Desktop (please complete the following information):
- OS: MX Linux 17 (debian-based)
- Python Version: 3.8.2
- NumPy Version: 1.19.2
- Pandas Version: 1.1.2
- matplotlib Version: 3.3.2
Additional context
I'm working using an ssh connection without X forwarding, using ipython (I replaced the prompts with standard >>>
prompt in the code examples above).