[Bioperl-guts-l] [15074] bioperl-live/trunk: Add method that finds the most common codons for the aa's, uses those to reverse-translate a nucleotide sequence

Brian Osborne bosborne at dev.open-bio.org
Tue Dec 2 23:57:50 EST 2008


Revision: 15074
Author:   bosborne
Date:     2008-12-02 23:57:49 -0500 (Tue, 02 Dec 2008)

Log Message:
-----------
Add method that finds the most common codons for the aa's, uses those to reverse-translate a nucleotide sequence

Modified Paths:
--------------
    bioperl-live/trunk/Bio/CodonUsage/Table.pm
    bioperl-live/trunk/Bio/Tools/CodonTable.pm
    bioperl-live/trunk/t/CodonTable.t

Modified: bioperl-live/trunk/Bio/CodonUsage/Table.pm
===================================================================
--- bioperl-live/trunk/Bio/CodonUsage/Table.pm	2008-12-03 03:40:20 UTC (rev 15073)
+++ bioperl-live/trunk/Bio/CodonUsage/Table.pm	2008-12-03 04:57:49 UTC (rev 15074)
@@ -27,7 +27,7 @@
                                    -gc => 1);
 
   # Or from local file
-  my $io      = Bio::CodonUsage::IO->new(-file=>"file");
+  my $io      = Bio::CodonUsage::IO->new(-file => "file");
   my $cdtable = $io->next_data();
 
   # Or create your own from a Bio::PrimarySeq compliant object,
@@ -275,7 +275,45 @@
 }
 		
 
+=head2 most_common_codons
 
+ Title    : most_common_codons
+ Usage    : my $common_codons = $cd_table->most_common_codons();
+ Purpose  : To obtain the most common codon for a given amino acid
+ Returns  : A reference to a hash where keys are 1 letter amino acid codes
+            and the values are the single most common codons for those amino acids
+ Arguments: None
+
+=cut
+
+sub most_common_codons {
+	my $self = shift;
+
+	my %return_hash;
+
+	for my $a ( keys %STRICTAA ) {
+
+		my $common_codon = '';
+		my $rel_freq = 0;
+		my $aa = $Bio::SeqUtils::THREECODE{$a};
+
+		if ( ! defined $self->{'_table'}{$aa} ) {
+			$self->warn("Amino acid $aa ($a) does not have any codons!");
+			next;
+		}
+
+		for my $codon ( keys %{ $self->{'_table'}{$aa}} ) {
+			if ($self->{'_table'}{$aa}{$codon}{'rel_freq'} > $rel_freq ){
+				$common_codon = $codon;
+				$rel_freq = $self->{'_table'}{$aa}{$codon}{'rel_freq'};
+			}
+		}
+		$return_hash{$a} = $common_codon;
+	}
+   
+	return \%return_hash;
+}
+
 =head2 codon_count
 
  Title   : codon_count

Modified: bioperl-live/trunk/Bio/Tools/CodonTable.pm
===================================================================
--- bioperl-live/trunk/Bio/Tools/CodonTable.pm	2008-12-03 03:40:20 UTC (rev 15073)
+++ bioperl-live/trunk/Bio/Tools/CodonTable.pm	2008-12-03 04:57:49 UTC (rev 15074)
@@ -20,7 +20,6 @@
   # the ones used by nucleotide sequence databases.  All common IUPAC
   # ambiguity codes for DNA, RNA and amino acids are recognized.
 
-  # to use
   use Bio::Tools::CodonTable;
 
   # defaults to ID 1 "Standard"
@@ -57,9 +56,9 @@
   my $seqobj    = Bio::PrimarySeq->new(-seq => 'FHGERHEL');
   my $iupac_str = $myCodonTable->reverse_translate_all($seqobj);
 
-  #boolean tests
+  # boolean tests
   print "Is a start\n"       if $myCodonTable->is_start_codon('ATG');
-  print "Is a termianator\n" if $myCodonTable->is_ter_codon('tar');
+  print "Is a terminator\n" if $myCodonTable->is_ter_codon('tar');
   print "Is a unknown\n"     if $myCodonTable->is_unknown_codon('JTG');
 
 =head1 DESCRIPTION
@@ -172,8 +171,8 @@
 # Let the code begin...
 
 package Bio::Tools::CodonTable;
-use vars qw(@NAMES @TABLES @STARTS $TRCOL $CODONS %IUPAC_DNA 	    $CODONGAP $GAP
-	    %IUPAC_AA %THREELETTERSYMBOLS $VALID_PROTEIN $TERMINATOR);
+use vars qw(@NAMES @TABLES @STARTS $TRCOL $CODONS %IUPAC_DNA $CODONGAP $GAP
+				%IUPAC_AA %THREELETTERSYMBOLS $VALID_PROTEIN $TERMINATOR);
 use strict;
 
 # Object preamble - inherits from Bio::Root::Root
@@ -298,14 +297,10 @@
 
  Title   : id
  Usage   : $obj->id(3); $id_integer = $obj->id();
- Function:
-
-           Sets or returns the id of the translation table.  IDs are
+ Function: Sets or returns the id of the translation table.  IDs are
            integers from 1 to 15, excluding 7 and 8 which have been
            removed as redundant. If an invalid ID is given the method
            returns 0, false.
-
-
  Example :
  Returns : value of id, a scalar, 0 if not a valid
  Args    : newvalue (optional)
@@ -537,38 +532,39 @@
 =cut
 
 sub revtranslate {
-    my ($self, $value, $coding) = @_;
-    my ($id) = $self->{'id'};
-    my (@aas,  $p);
-    my (@codons) = ();
+	my ($self, $value, $coding) = @_;
+	my ($id) = $self->{'id'};
+	my (@aas,  $p);
+	my (@codons) = ();
 
-    if (length($value) == 3 ) {
-	$value = lc $value;
-	$value = ucfirst $value;
-	$value = $THREELETTERSYMBOLS{$value};
-    }
-    if ( defined $value and $value =~ /$VALID_PROTEIN/ 
-	 and length($value) == 1 ) {
-	$value = uc $value;
-	@aas = @{$IUPAC_AA{$value}};	
-	foreach my $aa (@aas) {
-	    #print $aa, " -2\n";
-	    $aa = '\*' if $aa eq '*';
-	    while ($TABLES[$id-1] =~ m/$aa/g) {
-		$p = pos $TABLES[$id-1];
-		push (@codons, $TRCOL->{--$p});
+	if (length($value) == 3 ) {
+		$value = lc $value;
+		$value = ucfirst $value;
+		$value = $THREELETTERSYMBOLS{$value};
+	}
+	if ( defined $value and $value =~ /$VALID_PROTEIN/ 
+		  and length($value) == 1 ) {
+		$value = uc $value;
+		@aas = @{$IUPAC_AA{$value}};	
+		foreach my $aa (@aas) {
+			#print $aa, " -2\n";
+			$aa = '\*' if $aa eq '*';
+	      while ($TABLES[$id-1] =~ m/$aa/g) {
+		      $p = pos $TABLES[$id-1];
+		      push (@codons, $TRCOL->{--$p});
+	      }
 	    }
-	}
     }
 
-    if ($coding and uc ($coding) eq 'RNA') {
-	for my $i (0..$#codons)  {
-	    $codons[$i] =~ tr/t/u/;
-	}
-    }
+   if ($coding and uc ($coding) eq 'RNA') {
+	   for my $i (0..$#codons)  {
+	      $codons[$i] =~ tr/t/u/;
+	   }
+   }
+    
+   return @codons;
+}
 
-    return @codons;
-}
 =head2 reverse_translate_all
 
  Title   : reverse_translate_all
@@ -583,16 +579,14 @@
  Args    : a Bio::PrimarySeqI compatible object (mandatory)
            a Bio::CodonUsage::Table object and a threshold if only
              codons with a relative frequency above the threshold are
-             to be considered. 
-
-
+             to be considered.
 =cut
 
 sub reverse_translate_all {
 	
 	my ($self, $obj, $cut, $threshold) = @_;
 
-    ## check args are OK
+	## check args are OK
 
 	if (!$obj || !$obj->isa('Bio::PrimarySeqI')){
 		$self->throw(" I need a Bio::PrimarySeqI object, not a [".
@@ -637,6 +631,53 @@
 
 }
 
+=head2 reverse_translate_best
+
+ Title   : reverse_translate_best
+ Usage   : my $str = $cttable->reverse_translate_best($seq_object,$cutable);
+ Function: Reverse translates a protein sequence into plain nucleotide
+           sequence (GATC), uses the most common codon for each amino acid
+ Returns : A string
+ Args    : A Bio::PrimarySeqI compatible object and a Bio::CodonUsage::Table object
+
+=cut
+
+sub reverse_translate_best {
+
+	my ($self, $obj, $cut) = @_;
+
+	if (!$obj || !$obj->isa('Bio::PrimarySeqI')){
+		$self->throw(" I need a Bio::PrimarySeqI object, not a [".
+						 ref($obj) . "]");
+	}
+	if ($obj->alphabet ne 'protein')	{
+		$self->throw("Cannot reverse translate, need an amino acid sequence .".
+						 "This sequence is of type [" . $obj->alphabet ."]");
+	}
+	if ( !$cut | !$cut->isa('Bio::CodonUsage::Table'))	{
+		$self->throw("I need a Bio::CodonUsage::Table object, not a [".
+						 ref($cut). "].");
+	}
+
+	my $str = '';
+	my @seq = split '', $obj->seq;
+
+	my $cod_ref = $cut->most_common_codons();
+
+	for my $aa ( @seq ) {
+		if ($aa =~ /x/i) {
+			$str .= 'NNN';
+			next;
+		}
+		if ( defined $cod_ref->{$aa} ) {
+			$str .= $cod_ref->{$aa};
+		} else {
+			$self->throw("Input sequence contains invalid character: $aa");			
+		}
+	}
+   $str;
+}
+
 =head2 is_start_codon
 
  Title   : is_start_codon
@@ -647,7 +688,6 @@
  Returns : boolean
  Args    : codon
 
-
 =cut
 
 sub is_start_codon{
@@ -682,7 +722,6 @@
  Returns : boolean
  Args    : codon
 
-
 =cut
 
 sub is_ter_codon{

Modified: bioperl-live/trunk/t/CodonTable.t
===================================================================
--- bioperl-live/trunk/t/CodonTable.t	2008-12-03 03:40:20 UTC (rev 15073)
+++ bioperl-live/trunk/t/CodonTable.t	2008-12-03 04:57:49 UTC (rev 15074)
@@ -7,9 +7,10 @@
     use lib 't/lib';
 	use BioperlTest;
 	
-	test_begin(-tests => 51);
+	test_begin(-tests => 56);
     
 	use_ok('Bio::Tools::CodonTable');
+	use_ok('Bio::CodonUsage::IO');
 }
 
 # create a table object by giving an ID
@@ -179,7 +180,16 @@
 			                   -alphabet => 'dna');
 is $seq->translate->seq, 'M-K--N';
 
-ok $seq = Bio::PrimarySeq->new(-seq=>'ASDFGHKL');
+ok $seq = Bio::PrimarySeq->new(-seq =>'ASDFGHKL');
 is $myCodonTable->reverse_translate_all($seq), 'GCBWSNGAYTTYGGVCAYAARYTN';
-ok $seq = Bio::PrimarySeq->new(-seq=>'ASXFHKL');
+ok $seq = Bio::PrimarySeq->new(-seq => 'ASXFHKL');
 is $myCodonTable->reverse_translate_all($seq), 'GCBWSNNNNTTYCAYAARYTN';
+
+#
+# test reverse_translate_best(), requires a Bio::CodonUsage::Table object
+# 
+
+ok $seq = Bio::PrimarySeq->new(-seq =>'ACDEFGHIKLMNPQRSTVWY');
+ok my $io = Bio::CodonUsage::IO->new(-file => test_input_file('MmCT'));
+ok my $cut = $io->next_data();
+is $myCodonTable->reverse_translate_best($seq,$cut), 'GCCTGCGACGAGTTCGGCCACATCAAGCTGATGAACCCCCAGCGCTCCACCGTGTGGTAC';




More information about the Bioperl-guts-l mailing list