[Bioperl-guts-l] [14442] bioperl-live/trunk: first version of Gerp parser, usable

Senduran Balasubramaniam sendu at dev.open-bio.org
Wed Jan 16 05:51:53 EST 2008


Revision: 14442
Author:   sendu
Date:     2008-01-16 05:51:53 -0500 (Wed, 16 Jan 2008)

Log Message:
-----------
first version of Gerp parser, usable

Added Paths:
-----------
    bioperl-live/trunk/Bio/Tools/Phylo/Gerp.pm
    bioperl-live/trunk/t/data/ENr111.mfa.example.elems
    bioperl-live/trunk/t/gerp.t

Added: bioperl-live/trunk/Bio/Tools/Phylo/Gerp.pm
===================================================================
--- bioperl-live/trunk/Bio/Tools/Phylo/Gerp.pm	                        (rev 0)
+++ bioperl-live/trunk/Bio/Tools/Phylo/Gerp.pm	2008-01-16 10:51:53 UTC (rev 14442)
@@ -0,0 +1,147 @@
+# $Id: Gumby.pm,v 1.2 2007/06/14 18:01:52 nathan Exp $
+#
+# BioPerl module for Bio::Tools::Phylo::Gerp
+#
+# Cared for by Sendu Bala <bix at sendu.me.uk>
+#
+# Copyright Sendu Bala
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::Tools::Phylo::Gerp - Parses output from GERP
+
+=head1 SYNOPSIS
+
+  use strict;
+
+  use Bio::Tools::Phylo::Gerp;
+
+  my $parser = Bio::Tools::Phylo::Gerp->new(-file => "alignment.rates.elems");
+  
+  while (my $feat = $parser->next_result) {
+    my $start = $feat->start;
+    my $end = $feat->end;
+    my $rs_score = $feat->score;
+    my $p_value = ($feat->annotation->get_Annotations('p-value'))[0]->value;
+  }
+
+=head1 DESCRIPTION
+
+This module is used to parse the output from 'GERP' (v2) by Eugene Davydov
+(originally Gregory M. Cooper et al.). You can get details here:
+http://mendel.stanford.edu/sidowlab/
+
+It works on the .elems files produced by gerpelem.
+
+Each result is a Bio::SeqFeature::Annotated representing a single contrained
+element.
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to
+the Bioperl mailing list. Your participation is much appreciated.
+
+  bioperl-l at bioperl.org                  - General discussion
+  http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+of the bugs and their resolution. Bug reports can be submitted via the
+web:
+
+  http://bugzilla.open-bio.org/
+
+=head1 AUTHOR - Sendu Bala
+
+Email bix at sendu.me.uk
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object methods.
+Internal methods are usually preceded with a _
+
+=cut
+
+# Let the code begin...
+
+package Bio::Tools::Phylo::Gerp;
+use strict;
+
+use Bio::SeqFeature::Annotated;
+use Bio::Annotation::SimpleValue;
+
+use base qw(Bio::Root::Root Bio::Root::IO);
+
+
+=head2 new
+
+ Title   : new
+ Usage   : my $obj = Bio::Tools::Phylo::Gerp->new();
+ Function: Builds a new Bio::Tools::Phylo::Gerp object
+ Returns : Bio::Tools::Phylo::Gerp
+ Args    : -file (or -fh) should contain the contents of a gerpelem .elems file
+
+=cut
+
+sub new {
+    my ($class, @args) = @_;
+    my $self = $class->SUPER::new(@args);
+    
+    $self->_initialize_io(@args);
+    
+    return $self;
+}
+
+=head2 next_result
+
+ Title   : next_result
+ Usage   : $result = $obj->next_result();
+ Function: Returns the next result available from the input, or undef if there
+           are no more results.
+ Returns : Bio::SeqFeature::Annotated object. Features are annotated with a tag
+           for 'pvalue', and a 'predicted' tag. They have no sequence id unless
+           the input GERP file is non-standard, with the seq id as the 6th
+           column.
+ Args    : none
+
+=cut
+
+sub next_result {
+    my ($self) = @_;
+    
+    my $line = $self->_readline || return;
+    
+    while ($line !~ /^\d+\s+\d+\s+\d+\s+\S+\s+\S+\s*(?:\S+\s*)?$/) {
+        $line = $self->_readline || return;
+    }
+    
+    #start   end     length  RS-score        p-value
+    # code elsewhere adds seq_id on the end (not valid GERP), so we capture that
+    # if present
+    my ($start, $end, undef, $rs_score, $p_value, $seq_id) = split(/\s+/, $line);
+    my $feat = Bio::SeqFeature::Annotated->new(
+        $seq_id ? (-seq_id => $seq_id) : (),
+        -start        => $start, 
+        -end          => $end,
+        -strand       => 1,
+        -score        => $rs_score,
+        #-type         => 'conserved_region', ***causes 740x increase in SeqFeatureDB storage requirments!
+        -source       => 'GERP');
+    
+    my $sv = Bio::Annotation::SimpleValue->new(-tagname => 'predicted', -value => 1);
+    $feat->annotation->add_Annotation($sv);
+    $sv = Bio::Annotation::SimpleValue->new(-tagname => 'pvalue', -value => $p_value);
+    $feat->annotation->add_Annotation($sv);
+    
+    return $feat;
+}
+
+1;


Property changes on: bioperl-live/trunk/Bio/Tools/Phylo/Gerp.pm
___________________________________________________________________
Name: svn:executable
   + *

Added: bioperl-live/trunk/t/data/ENr111.mfa.example.elems
===================================================================
--- bioperl-live/trunk/t/data/ENr111.mfa.example.elems	                        (rev 0)
+++ bioperl-live/trunk/t/data/ENr111.mfa.example.elems	2008-01-16 10:51:53 UTC (rev 14442)
@@ -0,0 +1,10 @@
+## Note that element detection relies on computing a false positive rate that uses random permutations of the original data.  
+## Due to the stochastic nature of the permutations, you may see subtle differences from run to run.  
+## Do not expect these test results to be precisely what you will get when you do your own run on the test dataset. 
+
+500000 3.85
+334180 334352 173 449 1.03744e-165
+337735 337915 181 458.2 5.02405e-164
+262604 262861 258 473.1 3.64789e-117
+285427 285608 182 386.1 8.42494e-113
+309563 309744 182 383.6 2.88895e-111

Added: bioperl-live/trunk/t/gerp.t
===================================================================
--- bioperl-live/trunk/t/gerp.t	                        (rev 0)
+++ bioperl-live/trunk/t/gerp.t	2008-01-16 10:51:53 UTC (rev 14442)
@@ -0,0 +1,36 @@
+# -*-Perl-*- Test Harness script for Bioperl
+# $Id: gerp.t,v 1.15 2007/06/27 10:16:38 sendu Exp $
+
+use strict;
+
+BEGIN {
+    use lib 't/lib';
+    use BioperlTest;
+    
+    test_begin(-tests => 33);
+	
+    use_ok('Bio::Tools::Phylo::Gerp');
+}
+
+ok my $parser = Bio::Tools::Phylo::Gerp->new(-file => test_input_file('ENr111.mfa.example.elems'));
+
+my $count = 0;
+my @expected = ([qw(334180 334352 449 1.03744e-165)],
+                [qw(337735 337915 458.2 5.02405e-164)],
+                [qw(262604 262861 473.1 3.64789e-117)],
+                [qw(285427 285608 386.1 8.42494e-113)],
+                [qw(309563 309744 383.6 2.88895e-111)]);
+while (my $feat = $parser->next_result) {
+    $count++;
+    my @exp = @{shift(@expected)};
+    
+    isa_ok $feat, 'Bio::SeqFeature::Annotated';
+    is $feat->source->value, 'GERP', 'correct source';
+    is $feat->start, shift(@exp), 'feature start correct';
+    is $feat->end, shift(@exp), 'feature end correct';
+    is $feat->score, shift(@exp), 'feature score correct';
+    my ($p_value) = $feat->get_Annotations('pvalue');
+    is ref $p_value ? $p_value->value : $p_value, shift(@exp), 'feature pvalue correct';
+}
+
+is $count, 5, "correct number of results parsed out";




More information about the Bioperl-guts-l mailing list