[Bioperl-guts-l] [16955] bioperl-live/trunk: Partial redesign to simplify/clarify the internal code of B::A::T::ContigSpectrum

Florent E Angly fangly at dev.open-bio.org
Tue Apr 27 00:28:55 EDT 2010


Revision: 16955
Author:   fangly
Date:     2010-04-27 00:28:55 -0400 (Tue, 27 Apr 2010)
Log Message:
-----------
Partial redesign to simplify/clarify the internal code of B::A::T::ContigSpectrum

Modified Paths:
--------------
    bioperl-live/trunk/Bio/Assembly/Tools/ContigSpectrum.pm
    bioperl-live/trunk/t/Assembly/ContigSpectrum.t

Modified: bioperl-live/trunk/Bio/Assembly/Tools/ContigSpectrum.pm
===================================================================
--- bioperl-live/trunk/Bio/Assembly/Tools/ContigSpectrum.pm	2010-04-26 16:44:21 UTC (rev 16954)
+++ bioperl-live/trunk/Bio/Assembly/Tools/ContigSpectrum.pm	2010-04-27 04:28:55 UTC (rev 16955)
@@ -141,8 +141,7 @@
 
     to_string       create a string representation of the spectrum
     spectrum        import a hash contig spectrum
-    contig          determine a contig spectrum from a contig
-    assembly        determine a contig spectrum from an assembly
+    assembly        determine a contig spectrum from an assembly, contig or singlet
     dissolve        calculate a dissolved contig spectrum (depends on assembly)
     cross           produce a cross contig spectrum (depends on assembly)
     add             add a contig spectrum to an existing one
@@ -553,39 +552,17 @@
   return $spectrum;
 }
 
-=head2 contig
 
-  Title   : contig
-  Usage   : my @obj_list = $csp->contig();
-  Function: Update the contig spectrum object by adding a contig or singlet
-            object / get a reference to the list of assembly, contig and singlet
-            objects used in the contig spectrum.
-  Returns : array reference of Bio::Assembly::Scaffold, Bio::Assembly::Contig and
-            Bio::Assembly::Singlet objects
-  Args    : Bio::Assembly::Contig or Bio::Assembly::Singlet object
-
-=cut
-
-sub contig {
-  my ($self, $contig) = @_;
-  if (defined $contig) {
-    $self->_import_contig($contig);
-  }
-  my @obj_list = @{$self->{'_assembly'}} if defined $self->{'_assembly'};
-  return \@obj_list;
-}
-
-
 =head2 assembly
 
   Title   : assembly
   Usage   : my @obj_list = $csp->assembly();
-  Function: Update the contig spectrum object by adding an assembly object / get
-            a reference to the list of assembly, contig and singlet objects used
-            in the contig spectrum object.
-  Returns : array reference of Bio::Assembly::Scaffold, Bio::Assembly::Contig and
-            Bio::Assembly::Singlet objects
-  Args    : Bio::Assembly::Scaffold object
+  Function: Update the contig spectrum object by adding an assembly, contig or
+            singlet object to it
+  Returns : arrayref of assembly, contig and singlet objects used in the contig
+            spectrum object (Bio::Assembly::Scaffold, Bio::Assembly::Contig and
+            Bio::Assembly::Singlet objects)
+  Args    : Bio::Assembly::Scaffold, Contig or Singlet object
 
 =cut
 
@@ -594,8 +571,24 @@
   if (defined $assembly) {
     $self->_import_assembly($assembly);
   }
+  return $self->get_assembly();
+}
+
+
+=head2 get_assembly
+
+  Title   : get_assembly
+  Usage   : $csp->get_assembly();
+  Function: Get all assembly objects associated with a contig spectrum.
+  Returns : array reference of Bio::Assembly::Scaffold, Contig and Singlet objects
+  Args    : none
+
+=cut
+
+sub get_assembly {
+  my ($self) = @_;
   my @obj_list = @{$self->{'_assembly'}} if defined $self->{'_assembly'};
-  return \@obj_list;
+  return @obj_list;
 }
 
 
@@ -959,9 +952,9 @@
   Title   : _new_from_assembly
   Usage   : 
   Function: Creates a new contig spectrum object based solely on the result of 
-            an assembly
-  Returns : Bio::Assembly::Tools::ContigSpectrum
-  Args    : Bio::Assembly::Scaffold
+            an assembly, contig or singlet
+  Returns : Bio::Assembly::Tools::ContigSpectrum object
+  Args    : Bio::Assembly::Scaffold, Contig or Singlet object
 
 =cut
 
@@ -985,7 +978,7 @@
   # 3: Set sequence statistics: nof_seq and avg_seq_len
   ($csp->{'_avg_seq_len'}, $csp->{'_nof_seq'}) = $self->_get_assembly_seq_stats($assemblyobj);
   # 4: Set the spectrum: spectrum and max_size
-  for my $contigobj ($assemblyobj->all_contigs) {
+  for my $contigobj ( $self->_get_contig_like($assemblyobj) ) {
     my $size = $contigobj->num_sequences;
     if (defined $csp->{'_spectrum'}{$size}) {
       $csp->{'_spectrum'}{$size}++;
@@ -994,11 +987,6 @@
     }
     $csp->{'_max_size'} = $size if $size > $csp->{'_max_size'};
   }
-  my $nof_singlets = $assemblyobj->get_nof_singlets();
-  if (defined $nof_singlets) {
-    $csp->{'_spectrum'}{1} += $nof_singlets;
-    $csp->{'_max_size'} = 1 if $nof_singlets >= 1 && $csp->{'_max_size'} < 1;
-  }
   # 5: Set list of assembly objects used
   push @{$csp->{'_assembly'}}, $assemblyobj;
   # 6: Set number of repetitions
@@ -1007,48 +995,6 @@
 }
 
 
-=head2 _new_from_contig
-
-  Title   : _new_from_contig
-  Usage   :
-  Function: Creates a new contig spectrum object based solely on a contig or
-            singlet
-  Returns : Bio::Assembly::Tools::ContigSpectrum
-  Args    : Bio::Assembly::Contig or Bio::Assembly::Singlet
-
-=cut
-
-sub _new_from_contig {
-  # Create new contig spectrum object based purely on what we can get from a
-  # contig object
-  my ($self, $contigobj) = @_;
-  my $csp = Bio::Assembly::Tools::ContigSpectrum->new();
-  # 1: Set id
-  $csp->{'_id'} = $contigobj->id;
-  # 2: Set overlap statistics: nof_overlaps, min_overlap, avg_overlap,
-  #  min_identity and avg_identity
-  $csp->{'_eff_asm_params'} = $self->{'_eff_asm_params'};
-  $csp->{'_min_overlap'}    = $self->{'_min_overlap'};
-  $csp->{'_min_identity'}   = $self->{'_min_identity'};
-  if ($csp->{'_eff_asm_params'} > 0) {
-     ( $csp->{'_avg_overlap'}, $csp->{'_avg_identity'}, $csp->{'_min_overlap'}, 
-       $csp->{'_min_identity'}, $csp->{'_nof_overlaps'} )
-       = $csp->_get_contig_overlap_stats($contigobj);
-  }
-  # 3: Set sequence statistics: nof_seq and avg_seq_len
-  ($csp->{'_avg_seq_len'}, $csp->{'_nof_seq'}) = $csp->_get_contig_seq_stats($contigobj);
-  # 4: Set the spectrum: spectrum and max_size
-  my $size = $contigobj->num_sequences;
-  $csp->{'_spectrum'}{$size} = 1;
-  $csp->{'_max_size'} = $size;
-  # 5: Set list of assembly objects used
-  push @{$csp->{'_assembly'}}, $contigobj;
-  # 6: Set number of repetitions
-  $csp->{'_nof_rep'} = 1;
-  return $csp;
-}
-
-
 =head2 _new_dissolved_csp
 
   Title   : 
@@ -1109,42 +1055,28 @@
   my $asm_spectrum = { 1 => 0 };
   my $good_seqs = {};
   for my $obj (@{$mixed_csp->{'_assembly'}}) {
+    
     # Dissolve this assembly/contig/singlet for the given sequences
-    if ($obj->isa('Bio::Assembly::Scaffold')) {
-      my $assembly = $obj;
-      # For each contig/singlet
-      for my $contig ($assembly->all_contigs, $assembly->all_singlets) {
-         ($asm_spectrum, $good_seqs) = $self->_dissolve_contig($dissolved, $contig, $seq_header, $asm_spectrum, $good_seqs);
-      }
-    } elsif ($obj->isa('Bio::Assembly::Contig')) {
-      # a contig or singlet
-      my $contig = $obj;
-      ($asm_spectrum, $good_seqs) = $self->_dissolve_contig($dissolved, $contig, $seq_header, $asm_spectrum, $good_seqs);
+    for my $contig ( $self->_get_contig_like($obj) ) {
+      ($asm_spectrum, $good_seqs) = $self->_dissolve_contig($dissolved, $contig,
+        $seq_header, $asm_spectrum, $good_seqs);
     }
 
     # Update spectrum
     $dissolved->_import_spectrum($asm_spectrum);
+
     # Update nof_rep
     $dissolved->{'_nof_rep'}--;
     $dissolved->{'_nof_rep'} += $mixed_csp->{'_nof_rep'};
 
     # Get sequence and overlap stats
-    if ($obj->isa('Bio::Assembly::Scaffold')) {
-      ($dissolved->{'_avg_seq_len'}, $dissolved->{'_nof_seq'}) =
-        $dissolved->_get_assembly_seq_stats($obj, $good_seqs);
-      if ($dissolved->{'_eff_asm_params'} > 0) {
-        ( $dissolved->{'_avg_overlap'}, $dissolved->{'_avg_identity'}, $dissolved->{'_min_overlap'},
-          $dissolved->{'_min_identity'}, $dissolved->{'_nof_overlaps'} )
-          = $dissolved->_get_assembly_overlap_stats($obj, $good_seqs);
-      }
-    } elsif ($obj->isa('Bio::Assembly::Contig')) {
-      ($dissolved->{'_avg_seq_len'}, $dissolved->{'_nof_seq'}) =
-        $dissolved->_get_contig_seq_stats($obj, $good_seqs);
-      if ($dissolved->{'_eff_asm_params'} > 0) {
-        ( $dissolved->{'_avg_overlap'}, $dissolved->{'_avg_identity'}, $dissolved->{'_min_overlap'},
-          $dissolved->{'_min_identity'}, $dissolved->{'_nof_overlaps'} )
-          = $dissolved->_get_contig_overlap_stats($obj, $good_seqs);
-      }
+    ($dissolved->{'_avg_seq_len'}, $dissolved->{'_nof_seq'}) =
+      $dissolved->_get_assembly_seq_stats($obj, $good_seqs);
+    if ($dissolved->{'_eff_asm_params'} > 0) {
+      ( $dissolved->{'_avg_overlap'}, $dissolved->{'_avg_identity'},
+        $dissolved->{'_min_overlap'}, $dissolved->{'_min_identity'},
+        $dissolved->{'_nof_overlaps'} ) 
+        = $dissolved->_get_assembly_overlap_stats($obj, $good_seqs);
     }
 
   }
@@ -1175,7 +1107,9 @@
 
   # Update spectrum
   my $size = scalar @contig_seqs;
-  if ($size == 1) {
+  if ($size == 0) {
+    # do nothing
+  } elsif ($size == 1) {
     $$asm_spectrum{1}++;
   } elsif ($size > 1) {
     # Reassemble good sequences
@@ -1186,7 +1120,9 @@
     for my $qsize (keys %$contig_spectrum) {
       $$asm_spectrum{$qsize} += $$contig_spectrum{$qsize};
     }
-  } 
+  } else {
+     $self->throw("The size is not valid... how could that happen?");
+  }
 
   return $asm_spectrum, $good_seqs;
 }
@@ -1237,35 +1173,23 @@
   my $spectrum = {1 => 0};
   my $good_seqs = {};
   for my $obj (@{$mixed_csp->{'_assembly'}}) {
-    if ($obj->isa('Bio::Assembly::Scaffold')) {
-      # Go through contigs and skip the pure ones
-      my $assembly = $obj;
-      for my $contig ($assembly->all_contigs) {
-        ($spectrum, $good_seqs) = $self->_cross_contig($cross, $contig, $spectrum, $good_seqs);
-      }
-      # Get sequence stats
-      ($cross->{'_avg_seq_len'}, $cross->{'_nof_seq'}) = $cross->_get_assembly_seq_stats($assembly, $good_seqs);
-      # Get eff_asm_param for these sequences
-      if ($cross->{'_eff_asm_params'} > 0) {
-        ( $cross->{'_avg_overlap'}, $cross->{'_avg_identity'}, $cross->{'_min_overlap'},
-          $cross->{'_min_identity'}, $cross->{'_nof_overlaps'} )
-          = $cross->_get_assembly_overlap_stats($assembly, $good_seqs);
-      }
-    } elsif ($obj->isa('Bio::Assembly::Contig')) {
-      my $contig = $obj;
-      ($spectrum, $good_seqs) = $self->_cross_contig($cross, $contig, $spectrum, $good_seqs);
-      # Get sequence stats

@@ Diff output truncated at 10000 characters. @@


More information about the Bioperl-guts-l mailing list