import os
os.chdir('C:/Users/manso/OneDrive - University of West London/MSc Bioinformatics - UWL/3.DSB - Data Science for Bioinformatics/Practice/DSB W7')
import gzip
import glob
from Bio import SeqIO
from Bio import Entrez
from Bio import ExPASy
from Bio.Seq import Seq
from io import StringIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqIO.FastaIO import SimpleFastaParser
Both require specification of the file type, eg. "fasta", "genbank".
for seq_record in SeqIO.parse("MultiFasta.fasta", "fasta"):
print(seq_record.id)
print(repr(seq_record.seq))
print(len(seq_record))
Fasta1 Seq('TCCAAGTCCCTGCAGCTTCTTGACTACACCGTCCTTTCTCACCGAGCAGAC') 51 Fasta2 Seq('CTCTCTGCAGCTGCAGCCTGCACCCACAGAGGACTTCTGGCAGAGGTATAA') 51 Fasta3 Seq('TTCCTACCCGGCTCGGCGCTCCCGGCCCTGGCTGGGCAGGGCAGAGCGCCC') 51 Fasta4 Seq('CCGCTGAGACAGCAGGCCGGCGCTGCCCGCAGGTGTCGGCGGCAGCGGCGG') 51 Fasta5 Seq('GCTAGTTGTTTTCTGCTTGTTTACACGTTTTCATGCAGATTTTAACTGTAT') 51 Fasta6 Seq('CACTGGTGACGACCAGTCCCCCACCACAGGGTTCTGCCGCCAGTGACTGGA') 51
record = SeqIO.read("NC_005816.fna", "fasta")
print(record)
ID: gi|45478711|ref|NC_005816.1| Name: gi|45478711|ref|NC_005816.1| Description: gi|45478711|ref|NC_005816.1| Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence Number of features: 0 Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG')
identifiers = [seq_record.id for seq_record in SeqIO.parse("NC_005816.gb", "genbank")]
print(identifiers)
['NC_005816.1']
In the previous examples, a loop was used to iterate over all the records one by one.
The next() function can also be used to step through each record in an iterator:
record_iterator = SeqIO.parse("MultiFasta.fasta", "fasta")
first_record = next(record_iterator)
print(first_record.id)
second_record = next(record_iterator)
print(second_record.id)
Fasta1 Fasta2
The next() function is useful for obtaining only the first record in a multi-record file, resulting in the rest of the records in the file being ignored.
recordOne = next(SeqIO.parse("MultiFasta.fasta", "fasta"))
print(recordOne)
ID: Fasta1 Name: Fasta1 Description: Fasta1 Number of features: 0 Seq('TCCAAGTCCCTGCAGCTTCTTGACTACACCGTCCTTTCTCACCGAGCAGAC')
Bio.SeqIO.parse() gives you a SeqRecord iterator, and you get the records one by one.
Often you need to be able to access the records in any order. The Python list data type is perfect for this, and we can turn the record iterator into a list of SeqRecord objects using the built-in Python function list() like so:
records = list(SeqIO.parse("test1.fastq", "fastq"))
print("Found %i records" % len(records))
print("The last record")
last_record = records[-1] # using Python's list tricks
print(last_record.id)
print(repr(last_record.seq))
print(len(last_record))
print("The first record")
first_record = records[0] # remember, Python counts from zero
print(first_record.id)
print(repr(first_record.seq))
print(len(first_record))
Found 50000 records The last record SRR5680996.26785565 Seq('TGGCACAGCAGGCGGTTCTGACTGATGTGCACACAGTAAACAAAATGCTTG') 51 The first record SRR5680996.31734339 Seq('AGAGCAACACCTTGTGCCTCCAAGAAAGTATTAGTCTCCCTGAGGACTCTT') 51
rec_iterator = SeqIO.parse("NC_005816.gb", "genbank")
record1 = next(rec_iterator)
print(record1)
ID: NC_005816.1 Name: NC_005816 Description: Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence Database cross-references: Project:58037 Number of features: 41 /molecule_type=DNA /topology=circular /data_file_division=BCT /date=21-JUL-2008 /accessions=['NC_005816'] /sequence_version=1 /gi=45478711 /keywords=[''] /source=Yersinia pestis biovar Microtus str. 91001 /organism=Yersinia pestis biovar Microtus str. 91001 /taxonomy=['Bacteria', 'Proteobacteria', 'Gammaproteobacteria', 'Enterobacteriales', 'Enterobacteriaceae', 'Yersinia'] /references=[Reference(title='Genetics of metabolic variations between Yersinia pestis biovars and the proposal of a new biovar, microtus', ...), Reference(title='Complete genome sequence of Yersinia pestis strain 91001, an isolate avirulent to humans', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)] /comment=PROVISIONAL REFSEQ: This record has not yet been subject to final NCBI review. The reference sequence was derived from AE017046. COMPLETENESS: full length. Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG')
print(record1.annotations)
print(record1.annotations.keys())
print(record1.annotations.values())
{'molecule_type': 'DNA', 'topology': 'circular', 'data_file_division': 'BCT', 'date': '21-JUL-2008', 'accessions': ['NC_005816'], 'sequence_version': 1, 'gi': '45478711', 'keywords': [''], 'source': 'Yersinia pestis biovar Microtus str. 91001', 'organism': 'Yersinia pestis biovar Microtus str. 91001', 'taxonomy': ['Bacteria', 'Proteobacteria', 'Gammaproteobacteria', 'Enterobacteriales', 'Enterobacteriaceae', 'Yersinia'], 'references': [Reference(title='Genetics of metabolic variations between Yersinia pestis biovars and the proposal of a new biovar, microtus', ...), Reference(title='Complete genome sequence of Yersinia pestis strain 91001, an isolate avirulent to humans', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)], 'comment': 'PROVISIONAL REFSEQ: This record has not yet been subject to final\nNCBI review. The reference sequence was derived from AE017046.\nCOMPLETENESS: full length.'} dict_keys(['molecule_type', 'topology', 'data_file_division', 'date', 'accessions', 'sequence_version', 'gi', 'keywords', 'source', 'organism', 'taxonomy', 'references', 'comment']) dict_values(['DNA', 'circular', 'BCT', '21-JUL-2008', ['NC_005816'], 1, '45478711', [''], 'Yersinia pestis biovar Microtus str. 91001', 'Yersinia pestis biovar Microtus str. 91001', ['Bacteria', 'Proteobacteria', 'Gammaproteobacteria', 'Enterobacteriales', 'Enterobacteriaceae', 'Yersinia'], [Reference(title='Genetics of metabolic variations between Yersinia pestis biovars and the proposal of a new biovar, microtus', ...), Reference(title='Complete genome sequence of Yersinia pestis strain 91001, an isolate avirulent to humans', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)], 'PROVISIONAL REFSEQ: This record has not yet been subject to final\nNCBI review. The reference sequence was derived from AE017046.\nCOMPLETENESS: full length.'])
all_species = []
for seq_record in SeqIO.parse("NC_005816.fna", "fasta"):
all_species.append(seq_record.description.split()[1])
print(all_species)
['Yersinia']
record_iterator = SeqIO.parse("MultiFasta.fasta", "fasta")
FaRec1 = next(record_iterator)
print(FaRec1.id)
Fasta1
FaRec1.id = "new_id"
print(FaRec1.id)
new_id
FaRec1.description = FaRec1.id + " " + "desired new description"
print(FaRec1.format("fasta")[:50])
>new_id desired new description TCCAAGTCCCTGCAGCTT
Instead of using a filename to parse a sequence data from a file, you can give Bio.SeqIO a handle. In this section we’ll use handles to parse sequence from compressed files.
print(sum(len(r) for r in SeqIO.parse("NC_005816.gb", "genbank")))
9609
Here we use a file handle instead, using the with statement to close the handle automatically:
with open("NC_005816.gb") as handle:
print(sum(len(r) for r in SeqIO.parse(handle, "gb")))
9609
with gzip.open("NC_005816.gb.gz", "rt") as handle:
print(sum(len(r) for r in SeqIO.parse(handle, "gb")))
9609
Now we will use a different handle, an internet network connection, to use Entrez EFetch for downloading and parsing sequences from NCBI.
with Entrez.efetch(
db="nucleotide", rettype="fasta", retmode="text", id="1677537813") as handle:
seq_entrez = SeqIO.read(handle, "fasta")
print("%s with %i features" % (seq_entrez.id, len(seq_entrez.features)))
C:\ProgramData\Anaconda3\lib\site-packages\Bio\Entrez\__init__.py:658: UserWarning: Email address is not specified. To make use of NCBI's E-utilities, NCBI requires you to specify your email address with each request. As an example, if your email address is A.N.Other@example.com, you can specify it as follows: from Bio import Entrez Entrez.email = 'A.N.Other@example.com' In case of excessive usage of the E-utilities, NCBI will attempt to contact a user at the email address provided before blocking access to the E-utilities. warnings.warn(
NM_001272072.2 with 0 features
The file can be obtained from NCBI as a GenBank file:
with Entrez.efetch(
db="nucleotide", rettype="gb", retmode="text", id="1677537813") as handle:
seq_gb = SeqIO.read(handle, "gb")
print("%s with %i features" % (seq_gb.id, len(seq_gb.features)))
NM_001272072.2 with 12 features
If we use the parse() function instead of read() we can fetch multiple records.
with Entrez.efetch(
db="nucleotide", rettype="gb", retmode="text", id="1677537813,932245638,239051903"
) as handle:
for seq_rec in SeqIO.parse(handle, "gb"):
print("%s %s..." % (seq_rec.id, seq_rec.description[:50]))
print(
"Sequence length %i, %i features, from: %s"
% (
len(seq_rec),
len(seq_rec.features),
seq_rec.annotations["source"],
)
)
NM_001272072.2 Homo sapiens amine oxidase copper containing 1 (AO... Sequence length 2666, 12 features, from: Homo sapiens (human) KP300003.1 Gossypium hirsutum cultivar CCRI10 AOC1 (AOC1) mRN... Sequence length 762, 3 features, from: Gossypium hirsutum (cotton) NM_001161622.1 Mus musculus amine oxidase, copper-containing 1 (A... Sequence length 2734, 10 features, from: Mus musculus (house mouse)
Using a Genbank file (AOC1seq.gb) containing sequence data for AOC1 from P. pastoris (a yeast), S. miltiorrhiza (red sage) and G hirsutum (cotton) index the records and access them like a database using the Python dictionary data type:
AOC1_dict = SeqIO.to_dict(SeqIO.parse("AOC1seq.gb", "genbank"))
list(AOC1_dict.keys())
['AF358434.1', 'JF682784.1', 'KP300003.1']
list(AOC1_dict.values())
[SeqRecord(seq=Seq('TTCAAGGTTTTCGTTCTCATAGATAAGAGTTCCACGTGACCATTTGTATGGGGA...TTC'), id='AF358434.1', name='AF358434', description='Pichia pastoris lysyl oxidase (AOC1) gene, complete cds', dbxrefs=[]), SeqRecord(seq=Seq('AGAGAGAAGAAAAAGGTGAATCAATTAATTAGCCATGGCAGCTTCATCCGCCTC...AAT'), id='JF682784.1', name='JF682784', description='Salvia miltiorrhiza allene oxide cyclase (AOC1) gene, complete cds', dbxrefs=[]), SeqRecord(seq=Seq('ATGGCATCCACCACCTCCCTCAAACCGATCCCTTCCATCAACTTGCCTTCCCAA...TGA'), id='KP300003.1', name='KP300003', description='Gossypium hirsutum cultivar CCRI10 AOC1 (AOC1) mRNA, complete cds', dbxrefs=[])]
AOC1_record = AOC1_dict["JF682784.1"]
print("description:\n", AOC1_record.description)
print("sequence:\n", AOC1_record.seq)
description: Salvia miltiorrhiza allene oxide cyclase (AOC1) gene, complete cds sequence: AGAGAGAAGAAAAAGGTGAATCAATTAATTAGCCATGGCAGCTTCATCCGCCTCTACTATTTTGAAGGTTGCCGTCTCCTCCCCTTCTCCCGCCAGACTGCCGCCGTCCGCCGCCTCCCAGAAACTCCCATCCAAGCAGAAACACTCCCTCCCTAAACCCCTCGCTCTCTCCACCACTAAATCATTCTCATGCAGAGCACAGGCTGCAGCTGATTCAACACCCCGTCCCCGTAAGCTTTGCTTTTCTTCCATTTCAGATTCCTGATTTATTTTTTCTAAGAAATGAGAAATATTTGTTGTTCTAATTTAGAGAAAGTTCAAGAGCTGCACGTCTACGAGATCAACGAGCGTGATCGCGGCAGCCCCGCATACCTCCGATTGAGCCAAAAAACCGTCAATTCCCTCGGCGATCTCGTGCCTTTCAGCAACAAGGTACAATTAAATCTAATCTTTTTAATTATCACACTGCCCTGCCCTAATTATTTAAATGTGCGAAAATCATGATCGGCCGTGGATGTGGATTTGCAGGTGTACACCGGCGACCTGAAGAAACGGGCTGGAATAACGGCGGGGATATGCATCCTGATAAAGAACGAAGCAGAGAAGAAGGGCGACCGGTACGAGGCCATCTACAGCTTCTACTTGGGCGACTACGGCCACATCGCCGTGCAGGGGCCCTACCTCACCTACCAAGACACCGAGCTCGCCGTCACCGGCGGCTCCGGCGTCTTCGAGGGCGTCTACGGCCACGTCAAGCTCCACCAGATCATCTTCCCCTTCAAACTCTTCTATACCTTCCACCTCAAGGGCATCCCCGACCTGCCGCCGGAGCTGCTCGCCCAGCCGGTGCCGCCCGCGCTCCACGTGGAGCCCACCCCCGCCGCCAAGACTTGTGAACCGGGAGCCACGCTCCCTAACTTTACCAATTAGTTTCTCCGTTGACGGGTCGTTGCTTCCATGTGTAATAAAT
AOC1_FaDict = SeqIO.to_dict(SeqIO.parse("AOC1seq.fasta", "fasta"))
print(AOC1_FaDict.keys())
dict_keys(['gi|13936869|gb|AF358434.1|', 'gi|377552570|gb|JF682784.1|', 'gi|932245638|gb|KP300003.1|'])
Bio.SeqIO.to_dict() is very flexible but as it keeps all the data in memory, it limits the file size that can be worked with based on your computer’s RAM.
When working with larger files, use Bio.SeqIO.index(), which still returns a dictionary like object, and does not keep everything in memory. But rather records where each record is within the file and returns the specific record when indicated.
AOC1_index = SeqIO.index("AOC1seq.gb", "genbank")
print(len(AOC1_index))
3
AOC1_index.keys()
KeysView(SeqIO.index('AOC1seq.gb', 'genbank', alphabet=None, key_function=None))
AOC1FA_INDEX = SeqIO.index("AOC1seq.fasta", "fasta")
print(len(AOC1FA_INDEX))
3
list(AOC1FA_INDEX)
['gi|13936869|gb|AF358434.1|', 'gi|377552570|gb|JF682784.1|', 'gi|932245638|gb|KP300003.1|']
Print the FASTA sequence of a specificed indexed record (gi|932245638|gb|KP300003.1|):
print(AOC1FA_INDEX["gi|932245638|gb|KP300003.1|"].format("fasta"))
>gi|932245638|gb|KP300003.1| Gossypium hirsutum cultivar CCRI10 AOC1 (AOC1) mRNA, complete cds ATGGCATCCACCACCTCCCTCAAACCGATCCCTTCCATCAACTTGCCTTCCCAATCTCAC AGAGCTTTAGCTTCCAATTTCTCTTATTCAAAGCCTTTTCCCTTCCATGGCCTCAACCTC TCATCGGTCACTGAAACCAGTTCCTTCACCTCATCAAGATCCTCCAACCCCTTCACTACC ACTGCCTTCTTCTTCAACAAGTTCAAGCAGGAGGCCGCTCCACATACTCCCAAGCCCACA AAAGTTCAAGAGCTACACGTTTATGAAATGAACGAGAGAGACAGAAGCAGCCCTGCAGTT TTGAAACTAAGCCAAAAACCCGTAAACTCTCTGGGTGATCTGGTTCCTTTCACTAACAAG CTCTACTCTGGAGACTTGCAGAAAAGGGTGGGCATCACGGCTGGACTCTGTGTGCTGATC CAACACGTCCCGGAGAAAAAAGGCGATCGCTATGAAGCCATATACAGCTTCTACTTTGGT GACTACGGCCATTTGTCTGTCCAGGGACCCTATTTAACGTATGAGGATTCCTACTTGGCG GTGACGGGAGGATCTGGGATTTTCGAAGGAGCCTACGGACAGGTGAAGTTACATCAGATA GTGTTTCCCATGAAGCTGTATTATACATTCTACCTGAAAGGGATAGGTGATTTGCCAGCT GAGCTTCTAGGAAAGCCAGTGCCACCATCGCCTGCGGTGGAGCCCAGCGCGGCTGCTAAA GCTACGGAGCCCCATGGTAGTATTCCAAATTTTACCAACTGA
The dictionary-like object from Bio.SeqIO.index() gives you each entry as a SeqRecord object. However, it is sometimes useful to be able to get the original raw data straight from the file. For this use the get_raw() method which takes a single argument (the record identifier) and returns a bytes string (extracted from the file without modification).
A motivating example is extracting a subset of a records from a large file where either Bio.SeqIO.write() does not (yet) support the output file format (e.g. the plain text SwissProt file format) or where you need to preserve the text exactly (e.g. GenBank or EMBL output from Biopython does not yet preserve every last bit of annotation).
Let’s suppose you have download the whole of UniProt in the plain text SwissPort file format from their FTP site and uncompressed it as the file uniprot_sprot.dat, and you want to extract just a few records from it:
uniprot = SeqIO.index("uniprot_sprot.dat", "swiss")
with open("selected.dat", "wb") as out_handle:
for acc in ["P33487", "P19801", "P13689", "Q8JZQ5", "Q9TRC7"]:
print(out_handle.write(uniprot.get_raw(acc)))
26216 30205 10852 11032 13662
files = glob.glob("gbvrl*.seq")
print("%i files to index" % len(files))
4 files to index
gb_vrl = SeqIO.index_db("gbvrl.idx", files, "genbank")
print("%i sequences indexed" % len(gb_vrl))
453791 sequences indexed
print(gb_vrl["AB811634.1"].description)
Equine encephalosis virus NS3 gene, complete cds, isolate: Kimron1
print(gb_vrl.get_raw("AB811634.1"))
b'LOCUS AB811634 723 bp RNA linear VRL 17-JUN-2015\nDEFINITION Equine encephalosis virus NS3 gene, complete cds, isolate: Kimron1.\nACCESSION AB811634\nVERSION AB811634.1\nKEYWORDS .\nSOURCE Equine encephalosis virus\n ORGANISM Equine encephalosis virus\n Viruses; Riboviria; Orthornavirae; Duplornaviricota;\n Resentoviricetes; Reovirales; Reoviridae; Sedoreovirinae;\n Orbivirus.\nREFERENCE 1\n AUTHORS Westcott,D., Mildenberg,Z., Bellaiche,M., McGowan,S.L.,\n Grierson,S.S., Choudhury,B. and Steinbach,F.\n TITLE Evidence for the circulation of equine encephalosis virus in Israel\n since 2001\n JOURNAL PLoS ONE 8 (8), E70532 (2013)\n PUBMED 23950952\n REMARK DOI:10.1371/journal.pone.0070532\n Erratum:[PLoS One. 2013;8(9).\n doi:10.1371/annotation/4875ab92-466a-4f5f-b9c7-bc0e168a8f9b.\n Wescott, David G [corrected to Westcott, David G]]\n Publication Status: Online-Only\nREFERENCE 2 (bases 1 to 723)\n AUTHORS Westcott,D., Mildenberg,Z., Bellaiche,M., Mcgowan,S.L.,\n Grierson,S., Choudhury,B. and Steinbach,F.\n TITLE Direct Submission\n JOURNAL Submitted (29-MAR-2013) Contact:Bhudipa Choudhury Animal and Plant\n Health Agency (APHA), Virology; Woodham Lane, Addlestone, Surrey\n KT15-3NB, U.K\nFEATURES Location/Qualifiers\n source 1..723\n /organism="Equine encephalosis virus"\n /mol_type="genomic RNA"\n /isolate="Kimron1"\n /host="Equus caballus"\n /db_xref="taxon:201490"\n /country="Israel"\n gene 1..723\n /gene="NS3"\n CDS 1..723\n /gene="NS3"\n /codon_start=1\n /product="NS3 protein"\n /protein_id="BAN78512.1"\n /translation="MYPVLSRAVVGNPEERALMVYPPTAPMPPVTTWDNLKIDSVDGM\n KDLALNILDKNITSTTGADECDKREKAMFASVAESAADSPMVRTIKIQIYNRVLDDME\n REKRKCEKRRAMLRFVSNAFITLMLMSTFLMAMMQTPPITQYVEKACNGTGGTETNDP\n CGLMRWSGAVQFLMLIMSGFLYMCKRWITTLSTNADRISKNILKRRAYIDAARSNPNA\n TVLTVTGGNTGDLPYQFGDTAH"\nORIGIN \n 1 atgtatccag tactttcgag agccgttgtg ggcaatccag aggaacgtgc gttaatggtg\n 61 tacccgccga cagcgccgat gccgcctgtc acgacttggg ataaccttaa aatcgacagt\n 121 gttgatggaa tgaaggatct agcattaaat atattggata agaatataac tagcacgacg\n 181 ggagcggatg agtgcgataa acgtgagaag gccatgttcg cctcggtggc ggaatcagca\n 241 gcagatagcc cgatggtgcg tacaattaaa atccagatat ataacagagt attggatgat\n 301 atggagagag agaagcggaa gtgtgagaaa agacgtgcaa tgttgagatt tgtctcaaac\n 361 gcctttataa cgttaatgct gatgtccaca ttcttgatgg ctatgatgca gaccccgccg\n 421 ataacgcagt atgtagagaa agcgtgtaat gggacgggag ggacggagac gaacgacccg\n 481 tgcggtctga tgagatggag tggggctgtc caatttttga tgctgataat gagcggcttt\n 541 ttgtatatgt gcaaacgttg gatcactacg ctttcaacga acgcagatag gattagtaaa\n 601 aacattttga aacggcgagc gtacatcgat gcagccagat caaacccaaa tgcgacggtt\n 661 ctaactgtga ctggaggcaa cacgggggat ctaccgtacc agttcgggga tacggcccat\n 721 tag\n//\n'
The compressed file can be accessed the same way:
aoc1Seqbgz = SeqIO.index("AOC1seq.gb.bgz", "genbank")
print(len(aoc1Seqbgz))
aoc1Seqbgz.close()
3
Bio.SeqIO.parse() has been used for sequence input (reading files) Bio.SeqIO.write() is for sequence output (writing files)
rec1 = SeqRecord(
Seq(
"MMYQQGCFAGGTVLRLAKDLAENNRGARVLVVCSEITAVTFRGPSETHLDSMVGQALFGD"
"GAGAVIVGSDPDLSVERPLYELVWTGATLLPDSEGAIDGHLREVGLTFHLLKDVPGLISK"
"NIEKSLKEAFTPLGISDWNSTFWIAHPGGPAILDQVEAKLGLKEEKMRATREVLSEYGNM"
"SSAC",
),
id="gi|14150838|gb|AAK54648.1|AF376133_1",
description="chalcone synthase [Cucumis sativus]",
)
rec2 = SeqRecord(
Seq(
"YPDYYFRITNREHKAELKEKFQRMCDKSMIKKRYMYLTEEILKENPSMCEYMAPSLDARQ"
"DMVVVEIPKLGKEAAVKAIKEWGQ",
),
id="gi|13919613|gb|AAK33142.1|",
description="chalcone synthase [Fragaria vesca subsp. bracteata]",
)
rec3 = SeqRecord(
Seq(
"MVTVEEFRRAQCAEGPATVMAIGTATPSNCVDQSTYPDYYFRITNSEHKVELKEKFKRMC"
"EKSMIKKRYMHLTEEILKENPNICAYMAPSLDARQDIVVVEVPKLGKEAAQKAIKEWGQP"
"KSKITHLVFCTTSGVDMPGCDYQLTKLLGLRPSVKRFMMYQQGCFAGGTVLRMAKDLAEN"
"NKGARVLVVCSEITAVTFRGPNDTHLDSLVGQALFGDGAAAVIIGSDPIPEVERPLFELV"
"SAAQTLLPDSEGAIDGHLREVGLTFHLLKDVPGLISKNIEKSLVEAFQPLGISDWNSLFW"
"IAHPGGPAILDQVELKLGLKQEKLKATRKVLSNYGNMSSACVLFILDEMRKASAKEGLGT"
"TGEGLEWGVLFGFGPGLTVETVVLHSVAT",
),
id="gi|13925890|gb|AAK49457.1|",
description="chalcone synthase [Nicotiana tabacum]",
)
my_records = [rec1, rec2, rec3]
With the list of SeqRecord objects, FASTA format file will be written.
SeqIO.write(my_records, "my_example.faa", "fasta")
3
count = SeqIO.convert("AOC1seq.gb", "genbank", "my_example.fasta", "fasta")
print("Converted %i records" % count)
Converted 3 records
records = SeqIO.parse("AOC1seq.gb", "genbank")
out_handle = StringIO()
SeqIO.write(records, out_handle, "fasta")
fasta_data = out_handle.getvalue()
print(fasta_data)
>AF358434.1 Pichia pastoris lysyl oxidase (AOC1) gene, complete cds TTCAAGGTTTTCGTTCTCATAGATAAGAGTTCCACGTGACCATTTGTATGGGGAGCTCTG ATGCCAAGACATGACATGCTTGTCGGATTATGCAAATCGTGCATACTACATGTAGTGGCT CAGATTTGTGAGAGATGAAATTTATCTTCTGCCGGAGCTATCAGTGCAATATCATTCTTA GGTCGATGCTGGGGGGTCGGTTTGAAGTGCGGAAGATAGGCTATGAAACGTGCTGTCGTT ATCAGTTCAATTGCTGCTGAATTGAAATATATAAGAGGGAAGTAATATCTCTCGTGGGTC CTTCTTTCCTTCTTCATCCCACTCATCTCATACCCTCTCCATTGACAAGGAAAAGAATGC GATTGACCAATCTTTTAAGTTTGACCACTTTGGTGGCCTTAGCTGTGGCTGTGCCAGACT TCTACCAGAAACGTGAAGCTGTTAGCTCCAAGGAAGCTGCTCTTCTGAGAAGAGATGCTT CTGCTGAATGCGTTTCCAACGAGAATGTCGAAATTGAGGCCCCAAAAACCAACATCTGGA CCAGTTTGGCCAAGGAAGAGGTTCAAGAAGTTTTGGACTTGTTGCATTCCACTTACAACA TTACCGAAGTCACTAAGGCTGACTTCTTCTCCAACTATGTTCTTTGGATTGAGACTTTGA AGCCTAATAAGACTGAGGCTTTGACTTACTTGGATGAGGATGGAGATCTACCACCCAGAA ACGCCAGAACTGTTGTCTACTTTGGTGAGGGTGAGGAAGGATACTTTGAGGAGCTCAAGG TTGGACCACTGCCAGTCAGTGATGAGACCACCATTGAACCTCTTTCCTTCTACAACACTA ATGGAAAGTCTAAGCTTCCATTCGAAGTTGGTCATTTGGACAGAATCAAGAGTGCTGCAA AATCATCGTTCTTGAATAAAAACTTGAACACCACAATCATGAGAGATGTTCTTGAAGGCT TGATTGGTGTTCCATACGAAGACATGGGATGTCACTCTGCTGCTCCTCAACTGCACGACC CAGCAACTGGAGCTACTGTCGACTATGGTACTTGTAACATCAACACTGAAAACGATGCTG AAAACTTGGTTCCAACTGGTTTCTTTTTCAAGTTCGATATGACCGGAAGAGATGTTTCTC AATGGAAAATGTTGGAGTACATTTACAACAACAAAGTCTACACTTCTGCTGAGGAACTCT ACGAGGCCATGCAGAAGGATGATTTTGTTACATTGCCAAAGATTGACGTTGATAACCTTG ACTGGACTGTTATTCAAAGAAACGATTCTGCCCCTATTAGACATTTGGATGACAGAAAGT CTCCAAGATTAGTAGAGCCAGAAGGACGCAGATGGGCCTACGATGGTGAAGAAGAATACT TCTCTTGGATGGACTGGGGTTTCTACACTTCTTGGAGTAGAGACACTGGTATCAGTTTCT ATGATATCACCTTCAAGGGTGAGAGAATCGTCTACGAATTATCTTTGCAAGAGTTAATTG CCGAATACGGTTCAGATGACCCATTCAACCAACACACCTTCTACTCTGACATTTCTTACG GTGTCGGTAACAGATTCAGTTTAGTCCCAGGTTACGACTGTCCAGCTACCGCCGGTTACT TCACTACTGACACTTTCGAATACGATGAATTCTACAACCGTACACTGAGCTACTGTGTTT TCGAAAACCAGGAGGACTACTCACTGCTACGTCACACTGGTGCTTCTTACTCTGCCATTA CTCAAAACCCAACTCTGAATGTTAGATTTATTTCTACCATTGGAAACTACGATTACAACT TCTTGTACAAATTCTTCTTAGACGGTACCCTTGAGGTCTCTGTTCGTGCTGCTGGTTACA TTCAAGCTGGATACTGGAACCCAGAAACCAGTGCTCCTTACGGTCTCAAGATCCACGATG TCTTATCAGGTTCTTTCCATGACCACGTTCTGAACTACAAAGTTGATCTGGATGTTGGTG GAACCAAGAACCGTGCTTCTAAGTACGTTATGAAAGATGTCGACGTTGAATACCCATGGG CCCCAGGTACGGTTTACAACACCAAGCAAATTGCAAGAGAGGTATTAGAGAAGGAAGACT TCAATGGTATTAACTGGCCTGAAAACGGACAAGGTATCCTTTTGATTGAGTCTGCTGAAG AGACCAACAGCTTCGGTAACCCAAGAGCTTACAACATTATGCCAGGAGGAGGAGGAGTCC ACAGAATTGTTAAGAACTCTCGTTCTGGACCAGAAACCCAAAACTGGGCTCGTTCCAACT TGTTCTTGACTAAGCACAAGGACGAAGAGTTAAGATCTTCTACTGCTTTGAACACCAACG CCCTTTACGACCCACCTGTGAACTTTAACGCTTTCTTAGACGATGAGAGCTTGGATGGTG AGGACATTGTTGCTTGGGTTAACTTGGGTCTGCACCACTTACCAAACTCCAACGATTTGC CAAACACTATCTTCTCAACCGCACATGCTTCCTTTATGTTGACACCATTCAACTACTTCG ACAGTGAGAACTCTAGAGATACCACCCAACAAGTTTTCTACACTTACGACGACGAGACTG AAGAATCTAACTGGGAGTTCTACGGAAACGACTGGAGTTCTTGTGGTCTTGAGGTTCCTG AACCAAACTTCGAGGACTACACCTACGGAAGAGGAACCCGTATCAACAAAAAGATGACCA ACTCTGACGAAGTCTACTAAATCAATCGATAGACTGAGTCTAACTTAGCTATGGAGCTAC CTCTCCATTTTTGAATTC >JF682784.1 Salvia miltiorrhiza allene oxide cyclase (AOC1) gene, complete cds AGAGAGAAGAAAAAGGTGAATCAATTAATTAGCCATGGCAGCTTCATCCGCCTCTACTAT TTTGAAGGTTGCCGTCTCCTCCCCTTCTCCCGCCAGACTGCCGCCGTCCGCCGCCTCCCA GAAACTCCCATCCAAGCAGAAACACTCCCTCCCTAAACCCCTCGCTCTCTCCACCACTAA ATCATTCTCATGCAGAGCACAGGCTGCAGCTGATTCAACACCCCGTCCCCGTAAGCTTTG CTTTTCTTCCATTTCAGATTCCTGATTTATTTTTTCTAAGAAATGAGAAATATTTGTTGT TCTAATTTAGAGAAAGTTCAAGAGCTGCACGTCTACGAGATCAACGAGCGTGATCGCGGC AGCCCCGCATACCTCCGATTGAGCCAAAAAACCGTCAATTCCCTCGGCGATCTCGTGCCT TTCAGCAACAAGGTACAATTAAATCTAATCTTTTTAATTATCACACTGCCCTGCCCTAAT TATTTAAATGTGCGAAAATCATGATCGGCCGTGGATGTGGATTTGCAGGTGTACACCGGC GACCTGAAGAAACGGGCTGGAATAACGGCGGGGATATGCATCCTGATAAAGAACGAAGCA GAGAAGAAGGGCGACCGGTACGAGGCCATCTACAGCTTCTACTTGGGCGACTACGGCCAC ATCGCCGTGCAGGGGCCCTACCTCACCTACCAAGACACCGAGCTCGCCGTCACCGGCGGC TCCGGCGTCTTCGAGGGCGTCTACGGCCACGTCAAGCTCCACCAGATCATCTTCCCCTTC AAACTCTTCTATACCTTCCACCTCAAGGGCATCCCCGACCTGCCGCCGGAGCTGCTCGCC CAGCCGGTGCCGCCCGCGCTCCACGTGGAGCCCACCCCCGCCGCCAAGACTTGTGAACCG GGAGCCACGCTCCCTAACTTTACCAATTAGTTTCTCCGTTGACGGGTCGTTGCTTCCATG TGTAATAAAT >KP300003.1 Gossypium hirsutum cultivar CCRI10 AOC1 (AOC1) mRNA, complete cds ATGGCATCCACCACCTCCCTCAAACCGATCCCTTCCATCAACTTGCCTTCCCAATCTCAC AGAGCTTTAGCTTCCAATTTCTCTTATTCAAAGCCTTTTCCCTTCCATGGCCTCAACCTC TCATCGGTCACTGAAACCAGTTCCTTCACCTCATCAAGATCCTCCAACCCCTTCACTACC ACTGCCTTCTTCTTCAACAAGTTCAAGCAGGAGGCCGCTCCACATACTCCCAAGCCCACA AAAGTTCAAGAGCTACACGTTTATGAAATGAACGAGAGAGACAGAAGCAGCCCTGCAGTT TTGAAACTAAGCCAAAAACCCGTAAACTCTCTGGGTGATCTGGTTCCTTTCACTAACAAG CTCTACTCTGGAGACTTGCAGAAAAGGGTGGGCATCACGGCTGGACTCTGTGTGCTGATC CAACACGTCCCGGAGAAAAAAGGCGATCGCTATGAAGCCATATACAGCTTCTACTTTGGT GACTACGGCCATTTGTCTGTCCAGGGACCCTATTTAACGTATGAGGATTCCTACTTGGCG GTGACGGGAGGATCTGGGATTTTCGAAGGAGCCTACGGACAGGTGAAGTTACATCAGATA GTGTTTCCCATGAAGCTGTATTATACATTCTACCTGAAAGGGATAGGTGATTTGCCAGCT GAGCTTCTAGGAAAGCCAGTGCCACCATCGCCTGCGGTGGAGCCCAGCGCGGCTGCTAAA GCTACGGAGCCCCATGGTAGTATTCCAAATTTTACCAACTGA
Working with the low-level SimpleFastaParser or FastqGeneralIterator is often more practical than Bio.SeqIO.parse when dealing with large high-throughput FASTA or FASTQ sequencing files where speed matters. As noted in the introduction to this chapter, the file-format neutral Bio.SeqIO interface has the overhead of creating many objects even for simple formats like FASTA.
When parsing FASTA files, internally Bio.SeqIO.parse() calls the low-level SimpleFastaParser with the file handle. You can use this directly - it iterates over the file handle returning each record as a tuple of two strings, the title line (everything after the > character) and the sequence (as a plain string):
count = 0
total_len = 0
with open("AOC1seq.fasta") as in_handle:
for title, seq in SimpleFastaParser(in_handle):
count += 1
total_len += len(seq)
print("%i records with total sequence length %i" % (count, total_len))
3 records with total sequence length 4510