[Bioperl-guts-l] [16333] bioperl-run/trunk: Made the assemblers wrapper (Phrap, Cap3, TigrAssembler) share the same interfaces and code created in Bio::Tools:: Run::AssemblerBase
Florent E Angly
fangly at dev.open-bio.org
Thu Nov 5 04:26:10 EST 2009
Revision: 16333
Author: fangly
Date: 2009-11-05 04:26:09 -0500 (Thu, 05 Nov 2009)
Log Message:
-----------
Made the assemblers wrapper (Phrap, Cap3, TigrAssembler) share the same interfaces and code created in Bio::Tools::Run::AssemblerBase
Modified Paths:
--------------
bioperl-run/trunk/lib/Bio/Tools/Run/Cap3.pm
bioperl-run/trunk/lib/Bio/Tools/Run/Phrap.pm
bioperl-run/trunk/lib/Bio/Tools/Run/TigrAssembler.pm
bioperl-run/trunk/t/Cap3.t
bioperl-run/trunk/t/Phrap.t
bioperl-run/trunk/t/TigrAssembler.t
Added Paths:
-----------
bioperl-run/trunk/lib/Bio/Tools/Run/AssemblerBase.pm
bioperl-run/trunk/t/data/sample_dataset_1.qual
Added: bioperl-run/trunk/lib/Bio/Tools/Run/AssemblerBase.pm
===================================================================
--- bioperl-run/trunk/lib/Bio/Tools/Run/AssemblerBase.pm (rev 0)
+++ bioperl-run/trunk/lib/Bio/Tools/Run/AssemblerBase.pm 2009-11-05 09:26:09 UTC (rev 16333)
@@ -0,0 +1,472 @@
+package Bio::Tools::Run::AssemblerBase;
+
+use strict;
+use Bio::SeqIO;
+use Bio::Assembly::IO;
+
+use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase);
+
+our $default_out_type = 'Bio::Assembly::ScaffoldI';
+
+=head2 program_name
+
+ Title : program_name
+ Usage : $assembler>program_name()
+ Function: get/set the executable name
+ Returns: string
+ Args : string
+
+=cut
+
+sub program_name {
+ my ($self, $val) = @_;
+ $self->{'_program_name'} = $val if $val;
+ return $self->{'_program_name'};
+}
+
+
+=head2 program_dir
+
+ Title : program_dir
+ Usage : $assembler->program_dir()
+ Function: get/set the program dir
+ Returns: string
+ Args : string
+
+=cut
+
+sub program_dir {
+ my ($self, $val) = @_;
+ $self->{'_program_dir'} = $val if $val;
+ return $self->{'_program_dir'};
+}
+
+
+=head2 out_type
+
+ Title : out_type
+ Usage : $assembler->out_type('Bio::Assembly::ScaffoldI')
+ Function: get/set the desired type of output
+ Returns : The type of results to return
+ Args : Type of results to return (optional):
+ 'Bio::Assembly::IO' object
+ 'Bio::Assembly::ScaffoldI' object (default)
+ The name of a file to save the results in
+
+=cut
+
+sub out_type {
+ my ($self, $val) = @_;
+ if (defined $val) {
+ $self->{'_out_type'} = $val;
+ } else {
+ if (not defined $self->{'_out_type'}) {
+ $self->{'_out_type'} = $default_out_type;
+ }
+ }
+ return $self->{'_out_type'};
+}
+
+
+=head2 _assembly_format
+
+ Title : _assembly_format
+ Usage : $assembler->_assembly_format('tigr')
+ Function: get/set the driver to use to parse the assembly results
+ Returns : the driver to use to parse the assembly results
+ Args : the driver to use to parse the assembly results (optional)
+
+=cut
+
+sub _assembly_format {
+ my ($self, $asm_format) = @_;
+ if (defined $asm_format) {
+ $self->{'_assembly_format'} = $asm_format;
+ }
+ return $self->{'_assembly_format'};
+}
+
+
+=head2 _check_executable
+
+ Title : _check_executable
+ Usage : $assembler->_check_executable()
+ Function: Verifies that the program executable can be found, or throw an error.
+ Returns: 1 for success
+ Args : -
+
+=cut
+
+sub _check_executable {
+ my ($self) = @_;
+ if (not defined $self->executable()) {
+ $self->throw("Could not find the executable '".$self->program_name()."'. ".
+ 'You can use $self->program_dir() and $self->program_name() to '.
+ "specify the location of the program.");
+ }
+ return 1;
+}
+
+=head2 _check_sequence_input
+
+ Title : _check_sequence_input
+ Usage : $assembler->_check_sequence_input($seqs)
+ Function: Check that the sequence input is a valid file, or an arrayref of
+ sequence objects (Bio::PrimarySeqI or Bio::SeqI). If not, an
+ exception is thrown.
+ Returns : 1 if the check passed
+ Args : sequence input
+
+=cut
+
+sub _check_sequence_input {
+ my ($self, $seqs) = @_;
+ if (not $seqs) {
+ $self->throw("Must supply sequences as a FASTA filename or a sequence object".
+ " (Bio::PrimarySeqI or Bio::SeqI) array reference");
+ } else {
+ if (ref($seqs) =~ m/ARRAY/i ) {
+ for my $seq (@$seqs) {
+ unless ($seq->isa('Bio::PrimarySeqI') || $seq->isa('Bio::SeqI')) {
+ $self->throw("Not a valid Bio::PrimarySeqI or Bio::SeqI object");
+ }
+ }
+ } else {
+ if (not -f $seqs) {
+ $self->throw("Input file '$seqs' does not seem to exist.");
+ }
+ }
+ }
+ return 1;
+}
+
+=head2 _check_optional_quality_input
+
+ Title : _check_optional_quality_input
+ Usage : $assembler->_check_optional_quality_input($quals)
+ Function: If a quality score input is provided, check that it is either a
+ valid file or an arrayref of quality score objects (Bio::Seq::
+ QualI or Bio::Seq::Quality). If not, an exception is thrown.
+ Returns : 1 if the check passed (or quality score input was provided)
+ Args : quality score input
+
+=cut
+
+sub _check_optional_quality_input {
+ my ($self, $quals) = @_;
+ if (defined $quals) {
+ if (ref($quals) =~ m/ARRAY/i) {
+ for my $qual (@$quals) {
+ unless ($qual->isa('Bio::Seq::QualI') || $qual->isa('Bio::Seq::Quality')) {
+ $self->throw("Not a valid Bio::Seq::QualI or Bio::Seq::Quality object");
+ }
+ }
+ } else {
+ if (not -f $quals) {
+ $self->throw("Input file '$quals' does not seem to exist.");
+ }
+ }
+ }
+ return 1;
+}
+
+
+=head2 _prepare_input_file
+
+ Title : _prepare_input_file
+ Usage : ($fasta_file, $qual_file) = $assembler->_prepare_input_file(\@seqs, \@quals);
+ Function: Create the input FASTA and QUAL files as needed. If the input
+ sequences are provided in a (FASTA) file, the optional input quality
+ scores are also expected to be in a (QUAL) file. If the input
+ sequences are an arrayref of bioperl sequence objects, the optional
+ input quality scores are expected to be an arrayref of bioperl
+ quality score objects, in the same order as the sequence objects.
+ Returns : - input filehandle
+ - input filename
+ Args : - sequence input (FASTA file or sequence object arrayref)
+ - optional quality score input (QUAL file or quality score object
+ arrayref)
+
+=cut
+
+sub _prepare_input_files {
+ my ($self, $seqs, $quals) = @_;
+ # Set up input FASTA and QUAL files
+ $self->io->_initialize_io();
+ #$self->tempdir();
+ my $fasta_file;
+ my $qual_file;
+ if ( ref($seqs) =~ m/ARRAY/i ) {
+ # Input sequences are an arrayref of Bioperl sequence objects
+ if (defined $quals && not ref($quals) =~ m/ARRAY/i) {
+ $self->throw("The input sequences are an arrayref of sequence objects. ".
+ "Expecting the quality scores as an arrayref of quality score objects");
+ } else {
+ # The input qualities are not defined or are an arrayref of quality objects
+ # Write temp FASTA and QUAL input files
+ ($fasta_file, $qual_file) = $self->_write_seq_file($seqs, $quals);
+ }
+ } else {
+ # Sequence input is a FASTA file
+ $fasta_file = $seqs;
+ if (defined $quals && ref($quals) =~ m/ARRAY/i) {
+ # Quality input is defined and is an arrayref of quality objects
+ $self->throw("The input sequences are in a FASTA file. Expecting the ".
+ "quality scores in a QUAL file.");
+ } else {
+ # Input quality scores is either not defined or is a QUAL file
+ $qual_file = $quals;
+ }
+ }
+ return $fasta_file, $qual_file;
+}
+
+
+=head2 _write_seq_file
+
+ Title : _write_seq_file
+ Usage : ($fasta_file, $qual_file) = $assembler->_write_seq_file(\@seqs, \@quals)
+ Function: Write temporary FASTA and QUAL files on disk
+ Returns : name of FASTA file
+ name of QUAL file (undef if no quality scoress)
+ Args : - arrayref of sequence objects
+ - optional arrayref of quality score objects
+
+=cut
+
+sub _write_seq_file {
+ my ($self, $seqs, $quals) = @_;
+ # Store the sequences in temporary FASTA files
+ my $tmpdir = $self->tempdir();
+ my ($fasta_h, $fasta_file) = $self->io->tempfile( -dir => $tmpdir );
+ my ($qual_h, $qual_file ) = $self->io->tempfile( -dir => $tmpdir );
+ my $fasta_out = Bio::SeqIO->new( -fh => $fasta_h , -format => 'fasta');
+ my $qual_out = Bio::SeqIO->new( -fh => $qual_h , -format => 'qual' );
+ my $use_qual_file = 0;
+ my $size = scalar @$seqs;
+ for ( my $i = 0 ; $i < $size ; $i++ ) {
+ my $seq = $$seqs[$i];
+ # Make sure that all sequences have an ID (to prevent TIGR Assembler crash)
+ if (not defined $seq->id) {
+ my $newid = 'tmp'.$i;
+ print $newid."\n";
+ $seq->id($newid);
+ $self->warn("A sequence had no ID. Its ID is now $newid");
+ }
+ my $seqid = $seq->id;
+ # Write the FASTA entries in files (and QUAL if appropriate)
+ $fasta_out->write_seq($seq);
+ if ($seq->isa('Bio::Seq::Quality')) {
+ # Quality scores embedded in seq object
+ if (scalar @{$seq->qual} > 0) {
+ $qual_out->write_seq($seq);
+ $use_qual_file = 1;
+ }
+ } else {
+ # Quality score in a different object from the sequence object
+ my $qual = $$quals[$i];
+ if (defined $qual) {
+ my $qualid = $qual->id;
+ if ($qualid eq $seqid) {
+ # valid quality score information
+ $qual_out->write_seq($qual);
+ $use_qual_file = 1;
+ } else {
+ # ID mismatch between sequence and quality score
+ $self->warn("Sequence object with ID $seqid does not match quality ".
+ "score object with ID $qualid");
+ }
+ }
+ }
+ }
+ close($fasta_h);
+ close($qual_h);
+ $fasta_out->close();
+ $qual_out->close();
+ return undef if scalar @$seqs <= 0;
+ $qual_file = undef if $use_qual_file == 0;
+ return $fasta_file, $qual_file;
+}
+
+
+=head2 _prepare_output_file
+
+ Title : _prepare_output_file
+ Usage : ($out_fh, $out_file) = $assembler->_prepare_output_file( );
+ Function: Prepare the output file
+ Returns : - output filehandle
+ - output filename
+ Args : none
+
+=cut
+
+sub _prepare_output_file {
+ my ($self) = @_;
+ my ($output_fh, $output_file);
+ my $out_type = $self->out_type();
+ if ( (not $out_type eq 'Bio::Assembly::ScaffoldI') &&
+ (not $out_type eq 'Bio::Assembly::IO' ) ) {
+ # Output is a file with specified name
+ $output_file = $out_type;
+ open $output_fh, '>', $output_file or $self->throw("Could not write file ".
+ "'$output_file': $!");
+ } else {
+ ( $output_fh, $output_file ) = $self->io->tempfile( -dir => $self->tempdir() );
+ }
+ $self->outfile_name($output_file);
+ return $output_fh, $output_file;
+}
+
+=head2 _export_results
+
+ Title : _export_results
+ Usage : $results = $assembler->_export_results($asm_file);
+ Function: Export the assembly results
@@ Diff output truncated at 10000 characters. @@
More information about the Bioperl-guts-l
mailing list