import os
# Make the course practice folder the current working directory.
# NOTE(review): this is a hard-coded absolute path on one user's machine;
# os.chdir raises if the directory does not exist, so adjust it before
# running anywhere else.
os.chdir("C:/Users/manso/OneDrive - University of West London/MSc Bioinformatics - UWL/3.DSB - Data Science for Bioinformatics/Practice/DSB W4")
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
from Bio.Seq import Seq
from Bio import SeqFeature
from Bio.SeqUtils import GC
# Example DNA sequence reused by the indexing, slicing and conversion examples.
A_seq = Seq("GCTTCTTTTAGCCTTCACCTATTCAACAACAGGGGTGAGCTGCTTCTGTCC")
# Indexing is zero-based, so index 0 is the first letter.
print("First letter:", A_seq[0])
First letter: G
# Index 2 is the third letter because counting starts at 0.
print("Third letter:", A_seq[2])
Third letter: T
# A negative index counts from the end; -1 is the last letter.
print("Last letter:", A_seq[-1])
Last letter: C
# GC content computed by hand: the percentage of letters that are G or C.
100 * float (A_seq.count("G") + A_seq.count("C"))/ len(A_seq)
49.01960784313726
# Same percentage obtained from the Bio.SeqUtils.GC helper.
# NOTE(review): newer Biopython releases reportedly replace GC with
# gc_fraction (which returns a fraction rather than a percentage) -- confirm
# against the installed version.
print("GC content:", GC(A_seq))
GC content: 49.01960784313726
# Slice from index 4 up to, but not including, index 12 (8 letters).
slice1 = A_seq [4:12]
print("My slice:", slice1)
My slice: CTTTTAGC
We have just taken the section from position 4 up to, but not including, position 12 of our sequence above (positions are counted from 0).
Extended slicing also lets you extract every nth position of the sequence. This is done with the start::step form, which uses a double colon "::"; e.g., every third position starting from position 0 is extracted as follows:
# start::step slice: begin at index 0 and take every 3rd letter.
print("Every 3rd position from 0:", A_seq[0::3])
Every 3rd position from 0: GTTACCCTAAAGGCCCT
A_seq[2::4] #Every fourth position from position 2
Seq('TTGTCTCAGGCTC')
# A step of -1 walks the sequence backwards, giving the reversed sequence.
A_seq[::-1]
Seq('CCTGTCTTCGTCGAGTGGGGACAACAACTTATCCACTTCCGATTTTCTTCG')
# str() converts the Seq into a plain Python string.
A_string = str(A_seq)
print("Seq string:", A_string)
Seq string: GCTTCTTTTAGCCTTCACCTATTCAACAACAGGGGTGAGCTGCTTCTGTCC
# Seq objects are joined end to end with the + operator.
piece1 = Seq("GCTTC")
piece2 = Seq("GCCTTC")
combined = piece1 + piece2
print (combined)
GCTTCGCCTTC
# Build one sequence by appending each piece of the list in turn.
list_of_seqs = [Seq("TTCACC"), Seq("GGGT"), Seq("TGTCC")]
# Start from an empty Seq so that += always has a Seq on the left-hand side.
concatenated = Seq("")
for s in list_of_seqs:
    # Loop body: append the current piece to the running result.
    concatenated += s
# Printed once, after the loop has finished.
print("Concatenated sequences:", concatenated)
Concatenated sequences: TTCACCGGGTTGTCC
The .join method available for Python strings can also be used for Biopython Seq objects.
# join() places the spacer (ten "L" letters) between consecutive contigs.
contigs = [Seq("GGGTG"), Seq("ATGTCC"), Seq("TCACA")]
spacer = Seq("L"*10)
print(spacer.join(contigs))
GGGTGLLLLLLLLLLATGTCCLLLLLLLLLLTCACA
# Mixed-case sequence used by the case-conversion and membership examples.
d_seq = Seq("acgtATCT")
print (d_seq)
acgtATCT
# upper() returns an upper-case copy; d_seq itself is left unchanged.
d_seq.upper()
Seq('ACGTATCT')
# lower() returns a lower-case copy.
d_seq.lower()
Seq('acgtatct')
# Membership is case-sensitive: d_seq starts with lower-case "acgt", so this is False.
"ACGT" in d_seq
False
# After upper-casing, the same substring is found.
"ACGT" in d_seq.upper()
True
To import and view the codon tables, use the CodonTable module from the Bio.Data package.
# Fetch two unambiguous-DNA codon tables by name (standard and vertebrate
# mitochondrial) and print both.
from Bio.Data import CodonTable
standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
mito_table = CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"]
print(standard_table)
print(mito_table)
Table 1 Standard, SGC0 | T | C | A | G | --+---------+---------+---------+---------+-- T | TTT F | TCT S | TAT Y | TGT C | T T | TTC F | TCC S | TAC Y | TGC C | C T | TTA L | TCA S | TAA Stop| TGA Stop| A T | TTG L(s)| TCG S | TAG Stop| TGG W | G --+---------+---------+---------+---------+-- C | CTT L | CCT P | CAT H | CGT R | T C | CTC L | CCC P | CAC H | CGC R | C C | CTA L | CCA P | CAA Q | CGA R | A C | CTG L(s)| CCG P | CAG Q | CGG R | G --+---------+---------+---------+---------+-- A | ATT I | ACT T | AAT N | AGT S | T A | ATC I | ACC T | AAC N | AGC S | C A | ATA I | ACA T | AAA K | AGA R | A A | ATG M(s)| ACG T | AAG K | AGG R | G --+---------+---------+---------+---------+-- G | GTT V | GCT A | GAT D | GGT G | T G | GTC V | GCC A | GAC D | GGC G | C G | GTA V | GCA A | GAA E | GGA G | A G | GTG V | GCG A | GAG E | GGG G | G --+---------+---------+---------+---------+-- Table 2 Vertebrate Mitochondrial, SGC1 | T | C | A | G | --+---------+---------+---------+---------+-- T | TTT F | TCT S | TAT Y | TGT C | T T | TTC F | TCC S | TAC Y | TGC C | C T | TTA L | TCA S | TAA Stop| TGA W | A T | TTG L | TCG S | TAG Stop| TGG W | G --+---------+---------+---------+---------+-- C | CTT L | CCT P | CAT H | CGT R | T C | CTC L | CCC P | CAC H | CGC R | C C | CTA L | CCA P | CAA Q | CGA R | A C | CTG L | CCG P | CAG Q | CGG R | G --+---------+---------+---------+---------+-- A | ATT I(s)| ACT T | AAT N | AGT S | T A | ATC I(s)| ACC T | AAC N | AGC S | C A | ATA M(s)| ACA T | AAA K | AGA Stop| A A | ATG M(s)| ACG T | AAG K | AGG Stop| G --+---------+---------+---------+---------+-- G | GTT V | GCT A | GAT D | GGT G | T G | GTC V | GCC A | GAC D | GGC G | C G | GTA V | GCA A | GAA E | GGA G | A G | GTG V(s)| GCG A | GAG E | GGG G | G --+---------+---------+---------+---------+--
Sequences can be compared to check whether they match:
# A Seq compares equal to a plain string holding the same letters.
seq1 = Seq("ACGT")
"ACGT" == seq1
True
The Seq object is “read-only” (immutable), like a Python string. The benefit is that your sequence data cannot be changed accidentally during analysis.
You will get an error if you try to alter your Seq:
# Deliberately invalid: Seq does not support item assignment, so this line
# raises TypeError.
A_seq[5] = "G"
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-25-21c7da8dd0ce> in <module> ----> 1 A_seq[5] = "G" TypeError: 'Seq' object does not support item assignment
# Take a plain-string copy of the sequence; it is used to build the MutableSeq.
A_string = str(A_seq)
print("Seq string", A_string)
Seq string GCTTCTTTTAGCCTTCACCTATTCAACAACAGGGGTGAGCTGCTTCTGTCC
# MutableSeq is the editable counterpart of Seq; build one from the string.
from Bio.Seq import MutableSeq
mutable_seq = MutableSeq(A_string)
mutable_seq
MutableSeq('GCTTCTTTTAGCCTTCACCTATTCAACAACAGGGGTGAGCTGCTTCTGTCC')
# Rebuild mutable_seq from A_string, resetting it to the original letters.
mutable_seq = MutableSeq(A_string)
mutable_seq
MutableSeq('GCTTCTTTTAGCCTTCACCTATTCAACAACAGGGGTGAGCTGCTTCTGTCC')
# Equivalent construction from the literal letters instead of A_string.
mutable_seq = MutableSeq('GCTTCTTTTAGCCTTCACCTATTCAACAACAGGGGTGAGCTGCTTCTGTCC')
mutable_seq
MutableSeq('GCTTCTTTTAGCCTTCACCTATTCAACAACAGGGGTGAGCTGCTTCTGTCC')
# In-place edit: replace the letter at index 3 (a T) with C.
mutable_seq[3] = "C"
mutable_seq
MutableSeq('GCTCCTTTTAGCCTTCACCTATTCAACAACAGGGGTGAGCTGCTTCTGTCC')
# remove() deletes only the first occurrence of "T" (index 2 here).
mutable_seq.remove("T")
mutable_seq
MutableSeq('GCCCTTTTAGCCTTCACCTATTCAACAACAGGGGTGAGCTGCTTCTGTCC')
# reverse() reverses the sequence in place.
mutable_seq.reverse()
mutable_seq
MutableSeq('CCTGTCTTCGTCGAGTGGGGACAACAACTTATCCACTTCCGATTTTCCCG')
There is an UnknownSeq object that is a subclass of the basic Seq object, and its purpose is to represent a sequence where we know the length, but not the individual letters.
The Seq object could be used in this scenario, but it would waste a lot of memory to hold a million "N" characters when it could be stored as a single letter "N" and the desired length as an integer.
# UnknownSeq records only a length (20 here); the letters are unknown and
# print as "?".
# NOTE: Biopython flags UnknownSeq(length) as deprecated and recommends
# Seq(None, length) instead, per the warning emitted when this runs.
from Bio.Seq import UnknownSeq
unk = UnknownSeq(20)
print("unknown seq:", unk)
print("unknown seq length", len(unk))
unknown seq: ???????????????????? unknown seq length 20
C:\ProgramData\Anaconda3\lib\site-packages\Bio\Seq.py:2008: BiopythonDeprecationWarning: UnknownSeq(length) is deprecated; please use Seq(None, length) instead. warnings.warn(
# character= sets the placeholder letter shown for each position ("N" here).
unk_dna = UnknownSeq(20, character = "N")
print (unk_dna)
NNNNNNNNNNNNNNNNNNNN