[Bioperl-guts-l] [16619] bioperl-live/trunk/Bio/Assembly/Tools/ContigSpectrum.pm: Improvements in the calculation of the contig spectrum effective assembly parameters
Florent E Angly
fangly at dev.open-bio.org
Thu Jan 7 19:38:37 EST 2010
Revision: 16619
Author: fangly
Date: 2010-01-07 19:38:37 -0500 (Thu, 07 Jan 2010)
Log Message:
-----------
Improvements in the calculation of the contig spectrum effective assembly parameters
Modified Paths:
--------------
bioperl-live/trunk/Bio/Assembly/Tools/ContigSpectrum.pm
Modified: bioperl-live/trunk/Bio/Assembly/Tools/ContigSpectrum.pm
===================================================================
--- bioperl-live/trunk/Bio/Assembly/Tools/ContigSpectrum.pm 2010-01-07 22:36:57 UTC (rev 16618)
+++ bioperl-live/trunk/Bio/Assembly/Tools/ContigSpectrum.pm 2010-01-08 00:38:37 UTC (rev 16619)
@@ -975,6 +975,8 @@
# 2: Set overlap statistics: nof_overlaps, min_overlap, avg_overlap,
# min_identity and avg_identity
$csp->{'_eff_asm_params'} = $self->{'_eff_asm_params'};
+ $csp->{'_min_overlap'} = $self->{'_min_overlap'};
+ $csp->{'_min_identity'} = $self->{'_min_identity'};
if ($csp->{'_eff_asm_params'} > 0) {
( $csp->{'_avg_overlap'}, $csp->{'_avg_identity'}, $csp->{'_min_overlap'},
$csp->{'_min_identity'}, $csp->{'_nof_overlaps'} )
@@ -1026,6 +1028,8 @@
# 2: Set overlap statistics: nof_overlaps, min_overlap, avg_overlap,
# min_identity and avg_identity
$csp->{'_eff_asm_params'} = $self->{'_eff_asm_params'};
+ $csp->{'_min_overlap'} = $self->{'_min_overlap'};
+ $csp->{'_min_identity'} = $self->{'_min_identity'};
if ($csp->{'_eff_asm_params'} > 0) {
( $csp->{'_avg_overlap'}, $csp->{'_avg_identity'}, $csp->{'_min_overlap'},
$csp->{'_min_identity'}, $csp->{'_nof_overlaps'} )
@@ -1053,7 +1057,6 @@
Returns :
Args :
-
=cut
sub _new_dissolved_csp {
@@ -1085,19 +1088,22 @@
# New dissolved contig spectrum object
my $dissolved = Bio::Assembly::Tools::ContigSpectrum->new();
- # take parent attributes if existent or mixed attributes otherwise
+ # Take parent attributes if existent or mixed attributes otherwise
if ($self->{'_eff_asm_params'}) {
$dissolved->{'_eff_asm_params'} = $self->{'_eff_asm_params'};
} else {
$dissolved->{'_eff_asm_params'} = $mixed_csp->{'_eff_asm_params'};
}
- if ($self->{'_min_overlap'} && $self->{'_min_identity'}) {
- ( $dissolved->{'_min_overlap'}, $dissolved->{'_min_identity'} ) =
- ( $self->{'_min_overlap'}, $self->{'_min_identity'} );
+ if ($self->{'_min_overlap'}) {
+ $dissolved->{'_min_overlap'} = $self->{'_min_overlap'};
} else {
- ( $dissolved->{'_min_overlap'}, $dissolved->{'_min_identity'} ) =
- ( $mixed_csp->{'_min_overlap'}, $mixed_csp->{'_min_identity'} );
+ $dissolved->{'_min_overlap'} = $mixed_csp->{'_min_overlap'};
}
+ if ($self->{'_min_identity'}) {
+ $dissolved->{'_min_identity'} = $self->{'_min_identity'};
+ } else {
+ $dissolved->{'_min_identity'} = $mixed_csp->{'_min_identity'};
+ }
# Dissolve each assembly
my $asm_spectrum = { 1 => 0 };
@@ -1220,13 +1226,16 @@
} else {
$cross->{'_eff_asm_params'} = $mixed_csp->{'_eff_asm_params'};
}
- if ($self->{'_min_overlap'} && $self->{'_min_identity'}) {
- ( $cross->{'_min_overlap'}, $cross->{'_min_identity'} ) =
- ( $self->{'_min_overlap'}, $self->{'_min_identity'} );
+ if ($self->{'_min_overlap'}) {
+ $cross->{'_min_overlap'} = $self->{'_min_overlap'};
} else {
- ( $cross->{'_min_overlap'}, $cross->{'_min_identity'} ) =
- ( $mixed_csp->{'_min_overlap'}, $mixed_csp->{'_min_identity'} );
+ $cross->{'_min_overlap'} = $mixed_csp->{'_min_overlap'};
}
+ if ($self->{'_min_identity'}) {
+ $cross->{'_min_identity'} = $self->{'_min_identity'};
+ } else {
+ $cross->{'_min_identity'} = $mixed_csp->{'_min_identity'};
+ }
# Get cross contig spectrum for each assembly
my $spectrum = {1 => 0};
@@ -1617,7 +1626,13 @@
Title : _get_contig_overlap_stats
Usage : my ($avglength, $avgidentity, $minlength, $min_identity, $nof_overlaps)
= $csp->_get_contig_overlap_stats($contigobj);
- Function: Get statistics about pairwise overlaps in a contig or singlet
+ Function: Get statistics about pairwise overlaps in a contig or singlet. The
+ statistics are obtained using graph theory: each read is a node
+ and the edges between 2 reads are weighted by minus the number of
+ conserved residues in the alignment between the 2 reads. The
+ minimum spanning tree of this graph represents the overlaps that
+ form the contig. Overlaps that do not satisfy the minimum overlap
+ length and similarity get a malus on their score.
Returns : average overlap length
average identity percent
minimum overlap length
@@ -1666,12 +1681,24 @@
my $target_id = $target_obj->id;
next if defined $seq_hash && !defined $$seq_hash{$target_id};
- # Score the overlap with this sequence
+ # How much overlap with this sequence?
my ($aln_obj, $length, $identity)
= $self->_overlap_alignment($contig_obj, $seq_obj, $target_obj);
next if ! defined $aln_obj; # there was no sequence overlap
- my $score = $length * $identity / 100; # number of conserved residues
+ # Score the overlap as the number of conserved residues. In practice, it
+ # seems to work better than giving +1 for match and -3 for errors
+ # (mismatch or indels)
+ my $score = $length * $identity / 100;
+
+ # Apply a malus (square root) for scores that do not satisfy the minimum
+ # overlap length similarity. It is necessary for overlaps that get a high
+ # score without satisfying both the minimum values.
+ if ( ( $self->min_overlap && ($length < $self->min_overlap ) ) ||
+ ( $self->min_identity && ($identity < $self->min_identity) ) ) {
+ $score = sqrt($score);
+ }
+
# Record overlap
$overlaps{$i}{$j} = [$seq_id, $target_id, $score, $length, $identity];
}
@@ -1863,6 +1890,7 @@
# check overlap percentage identity
my $identity = $aln->overall_percentage_identity;
return if defined $min_identity && $identity < $min_identity;
+
# all checks passed, return alignment
return $aln, $overlap, $identity;
}
More information about the Bioperl-guts-l
mailing list