[Bioperl-guts-l] [16619] bioperl-live/trunk/Bio/Assembly/Tools/ContigSpectrum.pm: Improvements in the calculation of the contig spectrum effective assembly parameters

Florent E Angly fangly at dev.open-bio.org
Thu Jan 7 19:38:37 EST 2010


Revision: 16619
Author:   fangly
Date:     2010-01-07 19:38:37 -0500 (Thu, 07 Jan 2010)
Log Message:
-----------
Improvements in the calculation of the contig spectrum effective assembly parameters

Modified Paths:
--------------
    bioperl-live/trunk/Bio/Assembly/Tools/ContigSpectrum.pm

Modified: bioperl-live/trunk/Bio/Assembly/Tools/ContigSpectrum.pm
===================================================================
--- bioperl-live/trunk/Bio/Assembly/Tools/ContigSpectrum.pm	2010-01-07 22:36:57 UTC (rev 16618)
+++ bioperl-live/trunk/Bio/Assembly/Tools/ContigSpectrum.pm	2010-01-08 00:38:37 UTC (rev 16619)
@@ -975,6 +975,8 @@
   # 2: Set overlap statistics: nof_overlaps, min_overlap, avg_overlap,
   #  min_identity and avg_identity
   $csp->{'_eff_asm_params'} = $self->{'_eff_asm_params'};
+  $csp->{'_min_overlap'}    = $self->{'_min_overlap'};
+  $csp->{'_min_identity'}   = $self->{'_min_identity'};
   if ($csp->{'_eff_asm_params'} > 0) {
     ( $csp->{'_avg_overlap'}, $csp->{'_avg_identity'}, $csp->{'_min_overlap'},
       $csp->{'_min_identity'}, $csp->{'_nof_overlaps'} )
@@ -1026,6 +1028,8 @@
   # 2: Set overlap statistics: nof_overlaps, min_overlap, avg_overlap,
   #  min_identity and avg_identity
   $csp->{'_eff_asm_params'} = $self->{'_eff_asm_params'};
+  $csp->{'_min_overlap'}    = $self->{'_min_overlap'};
+  $csp->{'_min_identity'}   = $self->{'_min_identity'};
   if ($csp->{'_eff_asm_params'} > 0) {
      ( $csp->{'_avg_overlap'}, $csp->{'_avg_identity'}, $csp->{'_min_overlap'}, 
        $csp->{'_min_identity'}, $csp->{'_nof_overlaps'} )
@@ -1053,7 +1057,6 @@
   Returns : 
   Args    : 
 
-
 =cut
 
 sub _new_dissolved_csp {
@@ -1085,19 +1088,22 @@
   # New dissolved contig spectrum object
   my $dissolved = Bio::Assembly::Tools::ContigSpectrum->new();
 
-  # take parent attributes if existent or mixed attributes otherwise
+  # Take parent attributes if existent or mixed attributes otherwise
   if ($self->{'_eff_asm_params'}) {
     $dissolved->{'_eff_asm_params'} = $self->{'_eff_asm_params'};
   } else {
     $dissolved->{'_eff_asm_params'} = $mixed_csp->{'_eff_asm_params'};
   }
-  if ($self->{'_min_overlap'} && $self->{'_min_identity'}) {
-    ( $dissolved->{'_min_overlap'}, $dissolved->{'_min_identity'} ) = 
-      ( $self->{'_min_overlap'}, $self->{'_min_identity'} );
+  if ($self->{'_min_overlap'}) {
+    $dissolved->{'_min_overlap'} = $self->{'_min_overlap'};
   } else {
-    ( $dissolved->{'_min_overlap'}, $dissolved->{'_min_identity'} ) = 
-      ( $mixed_csp->{'_min_overlap'}, $mixed_csp->{'_min_identity'} );
+    $dissolved->{'_min_overlap'} = $mixed_csp->{'_min_overlap'};
   }
+  if ($self->{'_min_identity'}) {
+    $dissolved->{'_min_identity'} = $self->{'_min_identity'};
+  } else {
+    $dissolved->{'_min_identity'} = $mixed_csp->{'_min_identity'};
+  }
 
   # Dissolve each assembly
   my $asm_spectrum = { 1 => 0 };
@@ -1220,13 +1226,16 @@
   } else {
     $cross->{'_eff_asm_params'} = $mixed_csp->{'_eff_asm_params'};
   }
-  if ($self->{'_min_overlap'} && $self->{'_min_identity'}) {
-    ( $cross->{'_min_overlap'}, $cross->{'_min_identity'} ) = 
-      ( $self->{'_min_overlap'}, $self->{'_min_identity'} );
+  if ($self->{'_min_overlap'}) {
+    $cross->{'_min_overlap'} = $self->{'_min_overlap'};
   } else {
-    ( $cross->{'_min_overlap'}, $cross->{'_min_identity'} ) = 
-      ( $mixed_csp->{'_min_overlap'}, $mixed_csp->{'_min_identity'} );
+    $cross->{'_min_overlap'} = $mixed_csp->{'_min_overlap'};
   }
+  if ($self->{'_min_identity'}) {
+    $cross->{'_min_identity'} = $self->{'_min_identity'};
+  } else {
+    $cross->{'_min_identity'} = $mixed_csp->{'_min_identity'};
+  }
 
   # Get cross contig spectrum for each assembly
   my $spectrum = {1 => 0};
@@ -1617,7 +1626,13 @@
   Title   : _get_contig_overlap_stats
   Usage   : my ($avglength, $avgidentity, $minlength, $min_identity, $nof_overlaps)
               = $csp->_get_contig_overlap_stats($contigobj);
-  Function: Get statistics about pairwise overlaps in a contig or singlet
+  Function: Get statistics about pairwise overlaps in a contig or singlet. The
+              statistics are obtained using graph theory: each read is a node
+              and the edges between 2 reads are weighted by minus the number of
+              conserved residues in the alignment between the 2 reads. The
+              minimum spanning tree of this graph represents the overlaps that
+              form the contig. Overlaps that do not satisfy the minimum overlap
+              length and similarity get a malus on their score.
   Returns : average overlap length
             average identity percent
             minimum overlap length
@@ -1666,12 +1681,24 @@
       my $target_id = $target_obj->id;
       next if defined $seq_hash && !defined $$seq_hash{$target_id};
 
-      # Score the overlap with this sequence
+      # How much overlap with this sequence?
       my ($aln_obj, $length, $identity)
         = $self->_overlap_alignment($contig_obj, $seq_obj, $target_obj);
       next if ! defined $aln_obj; # there was no sequence overlap
-      my $score = $length * $identity / 100; # number of conserved residues
 
+      # Score the overlap as the number of conserved residues. In practice, it
+      # seems to work better than giving +1 for match and -3 for errors
+      # (mismatch or indels)
+      my $score = $length * $identity / 100;
+
+      # Apply a malus (square root) for scores that do not satisfy the minimum
+      # overlap length similarity. It is necessary for overlaps that get a high
+      # score without satisfying both the minimum values.
+      if ( ( $self->min_overlap  && ($length   < $self->min_overlap ) ) ||
+           ( $self->min_identity && ($identity < $self->min_identity) ) ) {
+        $score = sqrt($score);
+      }
+
       # Record overlap
       $overlaps{$i}{$j} = [$seq_id, $target_id, $score, $length, $identity];
     }
@@ -1863,6 +1890,7 @@
   # check overlap percentage identity
   my $identity = $aln->overall_percentage_identity;
   return if defined $min_identity && $identity < $min_identity;
+
   # all checks passed, return alignment
   return $aln, $overlap, $identity;
 }



More information about the Bioperl-guts-l mailing list