a minimal package for parsing and editing rna secondary structure
To install rna_secstruct
python -m pip install git+https://github.com/jyesselm/rna_secstruct
# load the module, all we need is SecStruct and MotifSearchParams
from rna_secstruct import SecStruct, MotifSearchParams
# initiate SecStruct object with sequence and dot bracket notation
struct = SecStruct("GGGAAACCC", "(((...)))")
Motifs are stored as a dictionary with each given a specific ID. Motifs
struct.motifs
{0: HELIX,GGG&CCC,(((&))), 1: HAIRPIN,GAAAC,(...)}
# can see a formatted version of all motifs with their corresponding ids
print(struct.to_str())
ID: 0, Helix3 GGG&CCC (((&)))
ID: 1, Hairpin3 GAAAC (...)
# can iterate over motifs
for m in struct:
print(m)
HELIX,GGG&CCC,(((&))) HAIRPIN,GAAAC,(...)
# access first motif which is id `0`
struct[0]
HELIX,GGG&CCC,(((&)))
An overview of all the properties that are stored in a motif
m0 = struct[0]
{
# what id is the motif
"m_id" : m0.m_id,
# the type of motif SINGLESTRAND, HELIX, HAIRPIN, JUNCTION
"m_type" : m0.m_type,
# the sequence of the motif, '&' seperates individal strands
"sequence" : m0.sequence,
# the structure in dot bracket notatio
"structure" : m0.structure,
# the position of each nucleotide in relation the full sequence/structure
"strands" : m0.strands,
# all the positions of in a single list
"positions" : m0.positions,
# the first position a motif contains
"start_pos" : m0.start_pos,
# the last position a motif contains
"end_pos" : m0.end_pos,
# the children nodes of this motif
"children" : m0.children
}
{'m_id': 0,
'm_type': 'HELIX',
'sequence': 'GGG&CCC',
'structure': '(((&)))',
'strands': [[0, 1, 2], [6, 7, 8]],
'positions': [0, 1, 2, 6, 7, 8],
'start_pos': 0,
'end_pos': 8,
'children': [HAIRPIN,GAAAC,(...)]}
# is this a motif a child of another motif. Can access with m0.parent.
m0.has_parent()
False
# are there nodes under this one. Helices and junctions will almost always have children
m0.has_children()
True
m0.is_single_strand()
False
m0.is_hairpin()
False
m0.is_junction()
False
m0.is_helix()
True
# recursive functions call all children to get the full sequence and structure of this node and all
# children
m0.recursive_sequence(), m0.recursive_structure()
('GGGAAACCC', '(((...)))')
struct = SecStruct("GGAAACGAAACGAAACC", "((...)(...)(...))")
# getting all helices.
# there are similare get_single_strand(), get_junctions(), get_hairpins()
struct.get_helices()
[HELIX,G&C,(&), HELIX,G&C,(&), HELIX,G&C,(&), HELIX,G&C,(&)]
# getting a copy. Always a good idea if you are going to change the structure
# see later sections on design
struct_copy = struct.get_copy()
# getting a sub structure
struct = SecStruct("GGGACCUUCGGGACCC", "(((.((....)).)))")
# if I want remove the bottom helix I can simply do
sub_struct = struct.get_sub_structure(1)
# this gets all nodes from 1 and all its children
sub_struct
GACCUUCGGGAC, (.((....)).)
# can search for motifs using get_motifs function with search parameters
struct = SecStruct("GGGACCUUCGGGACCC", "(((.((....)).)))")
msp = MotifSearchParams(sequence="GAC&GAC")
struct.get_motifs(msp)
[JUNCTION,GAC&GAC,(.(&).)]
msp
MotifSearchParams(sequence='GAC&GAC', structure=None, m_type=None, min_pos=0, max_pos=999, min_id=0, max_id=999)
You can search by sequence, structure, type of motifs, the miminal or maximum nucleotide position or the min and max id. These constraints can be useful if you wish to exclude common sequences on the 5' or 3' ends of a sequence of interest
seq = (
"GGAAGAUCGAGUAGAUCAAAGAGCCUAUGGCUGCCACCCGAGCCCUUGAACUACAGGGAACACUGGAAA"
"CAGUACCCCCUGCAAGGGCGUUUGACGGUGGCAGCCUAAGGGCUCAAAGAAACAACAACAACAAC"
)
ss = (
"....((((.....))))...((((((..((((((((((((((((((((.....(((((...((((....)"
")))...))))))))))))..)))..))))))))))...))))))...................."
)
struct = SecStruct(seq, ss)
msp = MotifSearchParams(m_type="JUNCTION", min_pos=50)
struct.get_motifs(msp)
[JUNCTION,GAACA&UACCC,(...(&)...)]
msp = MotifSearchParams(structure="(....)")
struct.get_motifs(msp)
[HAIRPIN,GGAAAC,(....)]
# it is also possible to search using motif "tokens" which are the indentifiers generated
# for each motif. For example "Helix4" is any helix of length 4
struct.get_motifs_by_token("Helix4")
[HELIX,GAUC&GAUC,((((&)))), HELIX,ACUG&CAGU,((((&))))]
# get a two way junction (Junction2) which has 5 unpaired nucleotides on the first
# strand and 0 on the other
struct.get_motifs_by_token("Junction2_5|0")
[JUNCTION,GAACUAC&GC,(.....(&))]
# using change_motif we can change the sequence or structure of a given motif
# here is a trival example of changing the sequence of the helix which has the id=0
struct = SecStruct("GGGAAACCC", "(((...)))")
struct.change_motif(0, "AGG&CCU", "(((&)))")
struct.sequence, struct.structure
('AGGAAACCU', '(((...)))')
# here we can change the loop sequence from a tetraloop to a hexaloop
struct.change_motif(1, "CUUUUUUG", "(......)")
struct.sequence, struct.structure
('AGCUUUUUUGCU', '(((......)))')
# can also make larger changes, by replacing a motif with a segment composed
# of many motifs, the object will reparse and create new motifs
struct = SecStruct("GGGAAACCC", "(((...)))")
print("original motifs:")
print(struct.to_str())
print("\nnew motifs")
struct.change_motif(1, "GGGACCUUCGGGACCC", "(((.((....)).)))")
print(struct.to_str())
original motifs: ID: 0, Helix3 GGG&CCC (((&))) ID: 1, Hairpin3 GAAAC (...)
new motifs ID: 0, Helix5 GGGGG&CCCCC (((((&))))) ID: 1, Junction2_1|1 GAC&GAC (.(&).) ID: 2, Helix2 CC&GG ((&)) ID: 3, Hairpin4 CUUCGG (....)