BioPerl

briano at bioteam.net

#!/usr/bin/perl

# Minimal SeqIO example: open the sequence file named by the first
# command-line argument and fetch the first sequence from it.
use strict;
use warnings;
use Bio::SeqIO;

# get the file name, somehow
my $file = shift;

# No -format argument is passed here; only the file name is given.
my $seqio_object = Bio::SeqIO->new(-file => $file);

# Pull the first sequence object off the stream.
my $seq_object = $seqio_object->next_seq;

# first, bring in the SeqIO module
use Bio::SeqIO;

# Notice that you do not have to use any Bio::SeqI
# objects, because SeqIO does this for you. In fact, it
# even knows which SeqI object to use for the provided format.

# Bring in the file and format, or die with a nice
# usage statement if one or both arguments are missing.
# The trailing newline stops die from appending "at ... line N"
# to the usage message.
my $usage = "getaccs.pl file format\n";
my $file = shift or die $usage;
my $format = shift or die $usage;

# Now create a new SeqIO object to bring in the input
# file. The new method takes arguments in the format
# key => value, key => value. The basic keys that it
# can accept values for are '-file' which expects some
# information on how to access your data, and '-format'
# which expects one of the Bioperl format labels (such as
# 'fasta' or 'genbank'). Although it is optional, it is good
# programming practice to provide > or < in front of any
# filenames provided in the -file parameter. This makes the
# resulting filehandle created by SeqIO explicitly read (<)
# or write (>). It will definitely help others reading your
# code understand the function of the SeqIO object.

my $inseq = Bio::SeqIO->new(-file   => "<$file",
                            -format => $format, );

# Now that we have a seq stream,
# we need to tell it to give us a $seq.
# We do this using the 'next_seq' method of SeqIO.
# Print one accession number per line until the stream is exhausted.
while (my $seq = $inseq->next_seq) {
   print $seq->accession_number,"\n";
}

# Read every sequence from an EMBL file and report the mean and
# median sequence length.
use strict;
use warnings;
use Bio::SeqIO;

my $input_file = shift;
my $seq_in = Bio::SeqIO->new( -format => 'embl',
                              -file   => $input_file,
                             );

# loads the whole file into memory - be careful
# if this is a big file, then this script will
# use a lot of memory
my @seq_array;
while ( my $seq = $seq_in->next_seq() ) {
   push(@seq_array,$seq);
}

# With no sequences there is nothing to average; stop here rather
# than dividing by zero below.
die "No sequences found in input\n" unless @seq_array;

# now do something with these. First sort by length,
# find the average and median lengths and print them out
@seq_array = sort { $a->length <=> $b->length } @seq_array;

my $count = scalar @seq_array;
my $total = 0;
for my $seq ( @seq_array ) {
   $total += $seq->length;
}

# Median of the sorted lengths: the middle element for an odd count,
# the mean of the two middle elements for an even count.
my $middle = int($count / 2);
my $median = $count % 2
           ? $seq_array[$middle]->length
           : ( $seq_array[$middle - 1]->length
             + $seq_array[$middle]->length ) / 2;

print "Mean length ", $total/$count, " Median ",
      $median, "\n";

# x2y.pl - convert a sequence file from one format to another.
use Bio::SeqIO;

# get command-line arguments, or die with a usage statement
# (the trailing newline keeps die from appending "at ... line N")
my $usage = "x2y.pl infile infileformat outfile outfileformat\n";
my $infile = shift or die $usage;
my $infileformat = shift or die $usage;
my $outfile = shift or die $usage;
my $outfileformat = shift or die $usage;

# create one SeqIO object to read in, and another to write out;
# the leading < and > make the read/write direction explicit
my $seq_in = Bio::SeqIO->new( -file   => "<$infile",
                              -format => $infileformat,
                            );

my $seq_out = Bio::SeqIO->new( -file   => ">$outfile",
                               -format => $outfileformat,
                             );

# write each entry in the input file to the output file
while (my $inseq = $seq_in->next_seq) {
   $seq_out->write_seq($inseq);
}

>cat myseqs.fa | all2y.pl fasta newseqs.gb genbank

# all2y.pl - read sequences from standard input in one format and
# write them to a file in another format.
use Bio::SeqIO;

# get command-line arguments, or die with a usage statement
# (the trailing newline keeps die from appending "at ... line N")
my $usage = "all2y.pl informat outfile outfileformat\n";
my $informat = shift or die $usage;
my $outfile = shift or die $usage;
my $outformat = shift or die $usage;

# create one SeqIO object to read in, and another to write out
# - \*STDIN is a reference to the Standard In filehandle glob
my $seqin = Bio::SeqIO->new( -fh     => \*STDIN,
                             -format => $informat,
                           );

my $seqout = Bio::SeqIO->new( -file   => ">$outfile",
                              -format => $outformat,
                            );

# write each entry in the input file to the output file
while (my $inseq = $seqin->next_seq) {
   $seqout->write_seq($inseq);
}

cat *.seq | in2out.pl EMBL Genbank | someother program

# in2out.pl - filter that reads sequences on standard input in one
# format and writes them to standard output in another.
use Bio::SeqIO;

# get command-line arguments, or die with a usage statement
# (the trailing newline keeps die from appending "at ... line N")
my $usage = "in2out.pl informat outformat\n";
my $informat = shift or die $usage;
my $outformat = shift or die $usage;

# create one SeqIO object to read in, and another to write out
my $seqin = Bio::SeqIO->new( -fh     => \*STDIN,
                             -format => $informat,
                           );

my $outseq = Bio::SeqIO->new( -fh     => \*STDOUT,
                              -format => $outformat,
                            );

# write each entry in the input to the output
while (my $inseq = $seqin->next_seq) {
   $outseq->write_seq($inseq);
}

# Parse sequences held in a string rather than in a file.
use Bio::SeqIO;

# get a string into $string somehow, with its format in $format,
# say from a web form.
my $string = ">SEQ1\nacgt\n>revseq1\ntgca\n";
my $format = "fasta";

# Open an in-memory filehandle on the string. The third argument must
# be a REFERENCE to the scalar; passing the scalar itself would make
# open treat the string's contents as a file name.
open(my $stringfh, "<", \$string) or die "Could not open string for reading: $!";

my $seqio = Bio::SeqIO->new(-fh     => $stringfh,
                            -format => $format,
                           );

while( my $seq = $seqio->next_seq ) {
   # process each seq
   print $seq->id . ' = ' . $seq->seq() . "\n";
}

# Write a sequence into a string rather than into a file.
use Bio::SeqIO;

# NOTE(review): $seq_obj is not defined in this snippet; presumably it
# is a sequence object obtained earlier - confirm before running.

# Open an in-memory filehandle for writing. The third argument must be
# a REFERENCE to the scalar that will receive the output.
my $string = '';
open(my $stringfh, ">", \$string) or die "Could not open string for writing: $!";

# Hand the in-memory handle to SeqIO so write_seq fills $string.
my $seqOut = Bio::SeqIO->new( -format => 'swiss',
                              -fh     => $stringfh,
                            );

$seqOut->write_seq($seq_obj);
print $string;

 gzip2fasta.pl gbpri1.seq.gz Genbank gbpri1.fa

# gzip2fasta.pl - read a gzip-compressed sequence file through gunzip
# and write its entries out as Fasta.
use Bio::SeqIO;

# get command-line arguments, or die with a usage statement
# (the trailing newline keeps die from appending "at ... line N")
my $usage = "gzip2fasta.pl infile informat outfile\n";
my $infile = shift or die $usage;
my $informat = shift or die $usage;
my $outfile = shift or die $usage;

# Decompress through a pipe. The list form of open runs gunzip
# directly, without a shell, so spaces or shell metacharacters in
# $infile cannot change the command that is executed.
open(my $gunzip_fh, '-|', '/usr/local/bin/gunzip', '-c', $infile)
   or die "Could not run gunzip on $infile: $!\n";

# create one SeqIO object to read in, and another to write out
my $seqin = Bio::SeqIO->new( -fh     => $gunzip_fh,
                             -format => $informat,
                           );

my $seqout = Bio::SeqIO->new( -file   => ">$outfile",
                              -format => 'Fasta',
                            );

# write each entry in the input to the output file
while (my $inseq = $seqin->next_seq) {
   $seqout->write_seq($inseq);
}

any2wublastable.pl myfile.gb Genbank mywublastable p

# any2wublastable.pl - read a sequence file and pipe its entries, as
# Fasta, into xdformat to build a database.
use Bio::SeqIO;

# get command-line arguments, or die with a usage statement
# (the trailing newline keeps die from appending "at ... line N")
my $usage = "any2wublastable.pl infile informat outdbname outdbtype\n";
my $infile = shift or die $usage;
my $informat = shift or die $usage;
my $outdbname = shift or die $usage;
my $outdbtype = shift or die $usage;

# create one SeqIO object to read in, and another to write out
my $seqin = Bio::SeqIO->new( -file   => "<$infile",
                             -format => $informat,
                           );

# Pipe into xdformat. The list form of open runs the program directly,
# without a shell, so $outdbname and $outdbtype are passed as single
# arguments whatever characters they contain.
open(my $xdformat_fh, '|-', '/usr/local/bin/xdformat',
     '-o', $outdbname, "-$outdbtype", '--', '-')
   or die "Could not run xdformat: $!\n";

my $seqout = Bio::SeqIO->new( -fh     => $xdformat_fh,
                              -format => 'Fasta',
                            );

# write each entry in the input to the output
while (my $inseq = $seqin->next_seq) {
   $seqout->write_seq($inseq);
}

 splitgb.pl inseq.gb

# splitgb.pl - split a GenBank file into human.gb and other.gb
# according to the source species of each entry.
use Bio::SeqIO;

# get command-line argument, or die with a usage statement
# (the trailing newline keeps die from appending "at ... line N")
my $usage = "splitgb.pl infile\n";
my $infile = shift or die $usage;
my $inseq = Bio::SeqIO->new( -file   => "<$infile",
                             -format => 'Genbank',
                           );

# one output stream per destination file
my %outfiles = ( 'human' => Bio::SeqIO->new(
                                          -file   => '>human.gb',
                                          -format => 'Genbank',
                                          ),
                 'other' => Bio::SeqIO->new(
                                          -file   => '>other.gb',
                                          -format => 'Genbank',
                                          ),
               );

while (my $seqin = $inseq->next_seq) {
   # here we make use of the species attribute, which returns a
   # species object, which has a binomial attribute that
   # holds the binomial species name of the source of the sequence.
   # An entry with no species object or no binomial name is filed
   # under 'other' instead of dying on a method call on undef.
   my $species  = $seqin->species;
   my $binomial = $species ? $species->binomial : undef;
   if (defined $binomial && $binomial =~ m/Homo sapiens/) {
       $outfiles{'human'}->write_seq($seqin);
   } else {
       $outfiles{'other'}->write_seq($seqin);
   }
}

# splitgb.pl - split a GenBank file by source species, writing each
# group in both Genbank and Fasta format.
use Bio::SeqIO;

# get command-line argument, or die with a usage statement
# (the trailing newline keeps die from appending "at ... line N")
my $usage = "splitgb.pl infile\n";
my $infile = shift or die $usage;
my $inseq = Bio::SeqIO->new( -file   => "<$infile",
                             -format => 'Genbank',
                           );

# Output streams, keyed first by group ('human' or 'other') and then
# by output format.
my %outfiles = ( human => {
                         Genbank => Bio::SeqIO->new(
                                                    -file   => '>human.gb',
                                                    -format => 'Genbank',
                                                    ),
                         Fasta   => Bio::SeqIO->new(
                                                    -file   => '>human.fa',
                                                    -format => 'Fasta',
                                                    ),
                         },
                 other => {
                         Genbank => Bio::SeqIO->new(
                                                    -file   => '>other.gb',
                                                    -format => 'Genbank',
                                                    ),
                         Fasta   => Bio::SeqIO->new(
                                                    -file => '>other.fa',
                                                    -format => 'Fasta',
                                                    ),
                         }
               );

while (my $seqin = $inseq->next_seq) {
   # An entry with no species object or no binomial name is filed
   # under 'other' instead of dying on a method call on undef.
   my $species  = $seqin->species;
   my $binomial = $species ? $species->binomial : undef;
   my $group    = (defined $binomial && $binomial =~ m/Homo sapiens/)
                ? 'human'
                : 'other';

   # write the entry to every output format of the chosen group
   for my $format (qw(Genbank Fasta)) {
       $outfiles{$group}->{$format}->write_seq($seqin);
   }
}

# Count the GenBank records in gbpri1.seq.gz whose keywords match GSS, reading the file through a gunzip pipe.
perl -MBio::SeqIO -e 'my $gss = 0; my $in = Bio::SeqIO->new(q(-file) => q(/usr/local/bin/gunzip -c gbpri1.seq.gz |), q(-format) => q(Genbank)); while (my $seq = $in->next_seq) { $gss++ if ($seq->keywords =~ m/GSS/);} print "There are $gss GSS sequences in gbpri1.seq.gz\n";'

  user@localhost ~/src/bioperl-live> perl t.pl bollocks silly
  ------------- EXCEPTION  -------------
  MSG: Could not open bollocks for reading: No such file or directory
  STACK Bio::Root::IO::_initialize_io Bio/Root/IO.pm:259
  STACK Bio::SeqIO::_initialize Bio/SeqIO.pm:441
  STACK Bio::SeqIO::genbank::_initialize Bio/SeqIO/genbank.pm:122
  STACK Bio::SeqIO::new Bio/SeqIO.pm:359
  STACK Bio::SeqIO::new Bio/SeqIO.pm:372
  STACK toplevel t.pl:9
  --------------------------------------

# Convert a GenBank file to Fasta, trapping the exception SeqIO throws
# when either file cannot be opened.
use strict;
use warnings;
use Bio::SeqIO;

my $input_file = shift;
my $output_file = shift;

# we have to declare $seq_in and $seq_out before
# the eval block as we want to use them afterwards
my $seq_in;
my $seq_out;

eval {
   $seq_in   = Bio::SeqIO->new(
                               -format => 'genbank',
                               -file   => $input_file,
                               );
   $seq_out  = Bio::SeqIO->new(
                               -format => 'fasta',
                               -file   => ">$output_file",
                               );
};
# an error occurred: report it on STDERR, one message per line,
# and leave with a non-zero exit status
if( $@ ) {
   print STDERR "Was not able to open files, sorry!\n";
   print STDERR "Full error is $@\n";
   exit(-1);
}

# both streams opened; copy every entry across
while( my $seq = $seq_in->next_seq() ) {
   $seq_out->write_seq($seq);
}

#!/usr/bin/perl

use strict;
use Bio::SeqIO;
use Benchmark qw(:all);

# GenBank file that both timing runs read from
my $file = "gbbct10.seq";

# Run 1: fetch 1000 records with the stream's default settings.
my $default_parse = sub {
    my $stream = Bio::SeqIO->new(-file => $file, -format => "genbank");
    my $remaining = 1000;
    while ($remaining-- > 0) {
        my $record = $stream->next_seq;
    }
};

# Run 2: fetch 1000 records after telling the stream's sequence
# builder to want nothing, then adding back only the display_id,
# desc and seq slots.
my $selective_parse = sub {
    my $stream = Bio::SeqIO->new(-file => $file, -format => "genbank");
    my $slot_builder = $stream->sequence_builder();
    $slot_builder->want_none();
    $slot_builder->add_wanted_slot('display_id','desc','seq');
    my $remaining = 1000;
    while ($remaining-- > 0) {
        my $record = $stream->next_seq;
    }
};

# Time each run once, default run first.
timethis(1, $default_parse);
timethis(1, $selective_parse);

timethis 1: 10 wallclock secs ( 9.64 usr +  0.02 sys =  9.66 CPU) @  0.10/s (n=1)
            (warning: too few iterations for a reliable count)
timethis 1:  1 wallclock secs ( 1.63 usr +  0.00 sys =  1.63 CPU) @  0.61/s (n=1)
            (warning: too few iterations for a reliable count)

Name	Description	File extension
abi	ABI tracefile	ab[i1]
ace	Ace database	ace
agave	AGAVE XML
alf	ALF tracefile	alf
asciitree	write-only, to visualize features
bsml	BSML, using XML::DOM	bsm,bsml
bsml_sax	BSML, using XML::SAX
chadoxml	CHADO sequence format
chaos	CHAOS sequence format
chaosxml	Chaos XML
ctf	CTF tracefile	ctf
embl	EMBL database	embl,ebl,emb,dat
entrezgene	Entrez Gene ASN1
excel	Excel
exp	Staden EXP format	exp
fasta	FASTA	fasta,fast,seq,fa,fsa,nt,aa
fastq	quality score data in FASTA-like format	fastq
flybase_chadoxml	variant of Chado XML
game	GAME XML
gcg	GCG	gcg
genbank	GenBank	gb,gbank,genbank
interpro	InterProScan XML
kegg	KEGG
largefasta	Large files, fasta format
lasergene	Lasergene format
locuslink	LocusLink
metafasta
phd	Phred	phd,phred
pir	PIR database	pir
pln	PLN tracefile	pln
qual	Phred
raw	plain text	txt
scf	Standard Chromatogram Format	scf
seqxml	SeqXML sequence format	xml
strider	DNA Strider format
swiss	SwissProt	swiss,sp
tab	tab-delimited
table	Table
tigr	TIGR XML
tigrxml	TIGR Coordset XML
tinyseq	NCBI TinySeq XML
ztr	ZTR tracefile	ztr

Format	Object Type
fasta	Bio::Seq
genbank	Bio::Seq::RichSeq
pir	Bio::Seq
embl	Bio::Seq::RichSeq
raw	Bio::Seq
ace	Bio::PrimarySeq
bsml	Bio::Seq::RichSeq
swiss	Bio::Seq::RichSeq

Authors

Copyright

The basics

10 second overview

Background Information

Formats

Working Examples

To and From a String

And more examples…

Caveats

Error Handling

Speed